
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 213 results for author: <span class="mathjax">Yao, T</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Yao%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yao, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yao%2C+T&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yao, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
1. arXiv:2411.05335 [pdf, other]
   Subjects: cs.CV, cs.CR, cs.LG
   Title: A Quality-Centric Framework for Generic Deepfake Detection
   Authors: Wentang Song, Zhiyuan Yan, Yuzhen Lin, Taiping Yao, Changsheng Chen, Shen Chen, Yandan Zhao, Shouhong Ding, Bin Li
   Abstract: This paper addresses the generalization issue in deepfake detection by harnessing forgery quality in training data. Generally, the forgery quality of different deepfakes varies: some have easily recognizable forgery clues, while others are highly realistic. Existing works often train detectors on a mix of deepfakes with varying forgery qualities, potentially leading detectors to short-cut the easy-to-spot artifacts from low-quality forgery samples, thereby hurting generalization performance. To tackle this issue, we propose a novel quality-centric framework for generic deepfake detection, composed of a Quality Evaluator, a low-quality data enhancement module, and a learning pacing strategy that explicitly incorporates forgery quality into the training process. The framework is inspired by curriculum learning and is designed to gradually enable the detector to learn increasingly challenging deepfake samples, starting with easier samples and progressing to more realistic ones. We employ both static and dynamic assessments of forgery quality, combining their scores to produce a final rating for each training sample. The rating score guides the selection of deepfake samples for training, with higher-rated samples having a higher probability of being chosen. Furthermore, we propose a novel frequency data augmentation method specifically designed for low-quality forgery samples, which helps to reduce obvious forgery traces and improve their overall realism. Extensive experiments show that our method can be applied in a plug-and-play manner and significantly enhances generalization performance.
   Submitted 8 November, 2024; originally announced November 2024.
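For intuition, the rating-guided sample selection described in this abstract (a curriculum that starts from easier forgeries and shifts probability mass toward higher-rated, more realistic ones) might be sketched as follows; the pacing schedule, score semantics, and all names are assumptions for illustration, not the authors' implementation.

```python
import numpy as np

def sample_batch(indices, quality_scores, epoch, num_epochs, batch_size, rng):
    # Curriculum-style sketch: early epochs favour low-quality (easy-to-spot)
    # forgeries; later epochs put more probability mass on high-quality
    # (realistic) ones. `quality_scores` in [0, 1] stand in for the combined
    # static + dynamic rating mentioned in the abstract.
    pace = epoch / max(1, num_epochs - 1)          # 0 = easy phase, 1 = hard phase
    weights = (1 - pace) * (1 - quality_scores) + pace * quality_scores + 1e-8
    probs = weights / weights.sum()
    return rng.choice(indices, size=batch_size, replace=False, p=probs)

rng = np.random.default_rng(0)
scores = rng.random(1000)                          # hypothetical per-sample ratings
batch = sample_batch(np.arange(1000), scores, epoch=5, num_epochs=20,
                     batch_size=32, rng=rng)
```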
2. arXiv:2411.02592 [pdf, other]
   Subjects: cs.CV, cs.AI
   Title: Decoupled Data Augmentation for Improving Image Classification
   Authors: Ruoxin Chen, Zhe Wang, Ke-Yue Zhang, Shuang Wu, Jiamu Sun, Shouli Wang, Taiping Yao, Shouhong Ding
   Abstract: Recent advancements in image mixing and generative data augmentation have shown promise in enhancing image classification. However, these techniques face the challenge of balancing semantic fidelity with diversity. Specifically, image mixing involves interpolating two images to create a new one, but this pixel-level interpolation can compromise fidelity. Generative augmentation uses text-to-image generative models to synthesize or modify images, often limiting diversity to avoid generating out-of-distribution data that potentially affects accuracy. We propose that this fidelity-diversity dilemma partially stems from the whole-image paradigm of existing methods. Since an image comprises the class-dependent part (CDP) and the class-independent part (CIP), and each part has a fundamentally different impact on the image's fidelity, treating the two parts uniformly can be misleading. To address this fidelity-diversity dilemma, we introduce Decoupled Data Augmentation (De-DA), which resolves the dilemma by separating images into CDPs and CIPs and handling them adaptively. To maintain fidelity, we use generative models to modify real CDPs under controlled conditions, preserving semantic consistency. To enhance diversity, we replace the image's CIP with inter-class variants, creating diverse CDP-CIP combinations. Additionally, we implement an online randomized combination strategy during training to generate numerous distinct CDP-CIP combinations cost-effectively. Comprehensive empirical evaluations validate the effectiveness of our method.
   Submitted 29 October, 2024; originally announced November 2024.
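As a rough illustration of the online CDP-CIP recombination mentioned above, the sketch below pastes a class-dependent region onto a randomly drawn class-independent background using a binary mask; the mask-based compositing and every name here are hypothetical, not the De-DA code.

```python
import numpy as np

def recombine(cdp_image, cdp_mask, cip_pool, rng):
    # Keep the class-dependent part (CDP) where the mask is 1 and fill the rest
    # with a randomly drawn class-independent part (CIP), i.e. another background.
    cip = cip_pool[rng.integers(len(cip_pool))]
    mask = cdp_mask[..., None].astype(cdp_image.dtype)   # HxW -> HxWx1, broadcast over RGB
    return cdp_image * mask + cip * (1.0 - mask)

rng = np.random.default_rng(0)
image = rng.random((224, 224, 3))                         # hypothetical training image
mask = rng.random((224, 224)) > 0.5                       # hypothetical CDP mask
backgrounds = [rng.random((224, 224, 3)) for _ in range(8)]
augmented = recombine(image, mask, backgrounds, rng)
```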
3. arXiv:2411.00078 [pdf, other]
   Subjects: cs.CV, cs.AI, eess.IV
   Title: How Good Are We? Evaluating Cell AI Foundation Models in Kidney Pathology with Human-in-the-Loop Enrichment
   Authors: Junlin Guo, Siqi Lu, Can Cui, Ruining Deng, Tianyuan Yao, Zhewen Tao, Yizhe Lin, Marilyn Lionts, Quan Liu, Juming Xiong, Yu Wang, Shilin Zhao, Catie Chang, Mitchell Wilkes, Mengmeng Yin, Haichun Yang, Yuankai Huo
   Abstract: Training AI foundation models has emerged as a promising large-scale learning approach for addressing real-world healthcare challenges, including digital pathology. While many of these models have been developed for tasks like disease diagnosis and tissue quantification using extensive and diverse training datasets, their readiness for deployment on some of the arguably simplest tasks, such as nuclei segmentation within a single organ (e.g., the kidney), remains uncertain. This paper seeks to answer the key question, "How good are we?", by thoroughly evaluating the performance of recent cell foundation models on a curated multi-center, multi-disease, and multi-species external testing dataset. Additionally, we tackle a more challenging question, "How can we improve?", by developing and assessing human-in-the-loop data enrichment strategies aimed at enhancing model performance while minimizing reliance on pixel-level human annotation. To address the first question, we curated a multi-center, multi-disease, and multi-species dataset consisting of 2,542 kidney whole slide images (WSIs). Three state-of-the-art (SOTA) cell foundation models (Cellpose, StarDist, and CellViT) were selected for evaluation. To tackle the second question, we explored data enrichment algorithms that distill predictions from the different foundation models within a human-in-the-loop framework, aiming to further enhance foundation model performance with minimal human effort. Our experimental results showed that all three foundation models improved over their baselines when fine-tuned on the enriched data. Interestingly, the baseline model with the highest F1 score does not yield the best segmentation outcomes after fine-tuning. This study establishes a benchmark for the development and deployment of cell vision foundation models tailored for real-world data applications.
   Submitted 31 October, 2024; originally announced November 2024.
4. arXiv:2410.04372 [pdf, other]
   Subjects: cs.CV
   Title: DiffusionFake: Enhancing Generalization in Deepfake Detection via Guided Stable Diffusion
   Authors: Ke Sun, Shen Chen, Taiping Yao, Hong Liu, Xiaoshuai Sun, Shouhong Ding, Rongrong Ji
   Abstract: The rapid progress of deepfake technology has made face swapping highly realistic, raising concerns about the malicious use of fabricated facial content. Existing methods often struggle to generalize to unseen domains due to the diverse nature of facial manipulations. In this paper, we revisit the generation process and identify a universal principle: deepfake images inherently contain information from both source and target identities, while genuine faces maintain a consistent identity. Building upon this insight, we introduce DiffusionFake, a novel plug-and-play framework that reverses the generative process of face forgeries to enhance the generalization of detection models. DiffusionFake achieves this by injecting the features extracted by the detection model into a frozen pre-trained Stable Diffusion model, compelling it to reconstruct the corresponding target and source images. This guided reconstruction process constrains the detection network to capture source- and target-related features that facilitate the reconstruction, thereby learning rich and disentangled representations that are more resilient to unseen forgeries. Extensive experiments demonstrate that DiffusionFake significantly improves the cross-domain generalization of various detector architectures without introducing additional parameters during inference. Our code is available at https://github.com/skJack/DiffusionFake.git.
   Submitted 6 October, 2024; originally announced October 2024.
   Comments: Accepted by NeurIPS 2024

5. arXiv:2410.01148 [pdf, other]
   Subjects: cs.CV
   Title: Automatic Image Unfolding and Stitching Framework for Esophageal Lining Video Based on Density-Weighted Feature Matching
   Authors: Muyang Li, Juming Xiong, Ruining Deng, Tianyuan Yao, Regina N Tyree, Girish Hiremath, Yuankai Huo
   Abstract: Endoscopy is a crucial tool for diagnosing the gastrointestinal tract, but its effectiveness is often limited by a narrow field of view and the dynamic nature of the internal environment, especially in the esophagus, where complex and repetitive patterns make image stitching challenging. This paper introduces a novel automatic image unfolding and stitching framework tailored for esophageal videos captured during endoscopy. The method combines feature matching algorithms, including LoFTR, SIFT, and ORB, to create a feature filtering pool and employs a Density-Weighted Homography Optimization (DWHO) algorithm to enhance stitching accuracy. By merging consecutive frames, the framework generates a detailed panoramic view of the esophagus, enabling thorough and accurate visual analysis. Experimental results show the framework achieves low Root Mean Square Error (RMSE) and high Structural Similarity Index (SSIM) across extensive video sequences, demonstrating its potential for clinical use and improving the quality and continuity of endoscopic visual data.
   Submitted 1 October, 2024; originally announced October 2024.
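To make the pooled-matching idea concrete, a minimal sketch along these lines is shown below: it gathers SIFT and ORB correspondences into one pool and fits a plain RANSAC homography per frame pair. The LoFTR matches and the paper's Density-Weighted Homography Optimization are deliberately left out, and all function names are illustrative only.

```python
import cv2
import numpy as np

def pooled_matches(img_a, img_b):
    # Collect point correspondences from several detectors into one pool.
    pool = []
    for detector, norm in [(cv2.SIFT_create(), cv2.NORM_L2),
                           (cv2.ORB_create(), cv2.NORM_HAMMING)]:
        kp_a, des_a = detector.detectAndCompute(img_a, None)
        kp_b, des_b = detector.detectAndCompute(img_b, None)
        if des_a is None or des_b is None:
            continue
        matcher = cv2.BFMatcher(norm, crossCheck=True)
        for m in matcher.match(des_a, des_b):
            pool.append((kp_a[m.queryIdx].pt, kp_b[m.trainIdx].pt))
    return pool

def stitch_pair(img_a, img_b):
    # Estimate a homography from the pooled matches; plain RANSAC here stands
    # in for the paper's density-weighted optimization.
    pool = pooled_matches(img_a, img_b)
    src = np.float32([p for p, _ in pool]).reshape(-1, 1, 2)
    dst = np.float32([q for _, q in pool]).reshape(-1, 1, 2)
    H, _ = cv2.findHomography(src, dst, cv2.RANSAC, 5.0)
    return H
```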
6. arXiv:2409.19947 [pdf, other]
   Subjects: cs.LG, cs.MA
   Title: Classification with a Network of Partially Informative Agents: Enabling Wise Crowds from Individually Myopic Classifiers
   Authors: Tong Yao, Shreyas Sundaram
   Abstract: We consider the problem of classification with a (peer-to-peer) network of heterogeneous and partially informative agents, each receiving local data generated by an underlying true class, and equipped with a classifier that can only distinguish between a subset of the entire set of classes. We propose an iterative algorithm that uses the posterior probabilities of the local classifier and recursively updates each agent's local belief on all the possible classes, based on its local signals and belief information from its neighbors. We then adopt a novel distributed min-rule to update each agent's global belief and enable learning of the true class for all agents. We show that under certain assumptions, the beliefs on the true class converge to one asymptotically almost surely. We provide the asymptotic convergence rate, and demonstrate the performance of our algorithm through simulations with image data, using random forest classifiers and MobileNet.
   Submitted 30 September, 2024; originally announced September 2024.
   Comments: 12 pages, 15 figures, 60th Annual Allerton Conference on Communication, Control, and Computing
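A toy rendering of the belief updates described in this abstract (a posterior-weighted local update followed by a min-rule over neighbours) might look like the following; the exact update form, normalization, and all names are assumptions for illustration rather than the paper's algorithm.

```python
import numpy as np

def min_rule_step(local_beliefs, posteriors, neighbors):
    # One synchronous round: each agent folds its classifier posterior into its
    # local belief, then sets its global belief to the normalized element-wise
    # minimum of its own and its neighbors' updated local beliefs.
    new_local, new_global = {}, {}
    for i, post in posteriors.items():
        b = local_beliefs[i] * post                      # Bayesian-style local update
        new_local[i] = b / b.sum()
    for i in posteriors:
        group = [new_local[i]] + [new_local[j] for j in neighbors[i]]
        m = np.minimum.reduce(group)                     # distributed min-rule
        new_global[i] = m / m.sum() if m.sum() > 0 else np.full_like(m, 1.0 / len(m))
    return new_local, new_global
```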
7. arXiv:2409.13846 [pdf]
   Subjects: cs.CV, cs.LG
   Title: Multi-Modality Conditioned Variational U-Net for Field-of-View Extension in Brain Diffusion MRI
   Authors: Zhiyuan Li, Tianyuan Yao, Praitayini Kanakaraj, Chenyu Gao, Shunxing Bao, Lianrui Zuo, Michael E. Kim, Nancy R. Newlin, Gaurav Rudravaram, Nazirah M. Khairi, Yuankai Huo, Kurt G. Schilling, Walter A. Kukull, Arthur W. Toga, Derek B. Archer, Timothy J. Hohman, Bennett A. Landman
   Abstract: An incomplete field-of-view (FOV) in diffusion magnetic resonance imaging (dMRI) can severely hinder the volumetric and bundle analyses of whole-brain white matter connectivity. Although existing works have investigated imputing the missing regions using deep generative models, it remains unclear how to specifically utilize additional information from paired multi-modality data and whether this can enhance the imputation quality and be useful for downstream tractography. To fill this gap, we propose a novel framework for imputing dMRI scans in the incomplete part of the FOV by integrating the learned diffusion features from the acquired part of the FOV with the complete brain anatomical structure. We hypothesize that, by this design, the proposed framework can enhance the imputation performance of the dMRI scans and therefore be useful for repairing whole-brain tractography in corrupted dMRI scans with incomplete FOV. We tested our framework on two cohorts from different sites with a total of 96 subjects and compared it with a baseline imputation method that treats the information from T1w and dMRI scans equally. The proposed framework achieved significant improvements in imputation performance, as demonstrated by the angular correlation coefficient (p < 1E-5), and in downstream tractography accuracy, as demonstrated by the Dice score (p < 0.01). Results suggest that the proposed framework improved imputation performance in dMRI scans by specifically utilizing additional information from paired multi-modality data, compared with the baseline method. The imputation achieved by the proposed framework enhances whole-brain tractography and therefore reduces the uncertainty when analyzing bundles associated with neurodegenerative disease.
   Submitted 20 September, 2024; originally announced September 2024.
   Comments: 20 pages; 8 figures

8. arXiv:2409.08260 [pdf, other]
   Subjects: cs.CV, cs.MM
   Title: Improving Text-guided Object Inpainting with Semantic Pre-inpainting
   Authors: Yifu Chen, Jingwen Chen, Yingwei Pan, Yehao Li, Ting Yao, Zhineng Chen, Tao Mei
   Abstract: Recent years have witnessed the success of large text-to-image diffusion models and their remarkable potential to generate high-quality images. The further pursuit of enhancing the editability of images has sparked significant interest in the downstream task of inpainting a novel object described by a text prompt within a designated region in the image. Nevertheless, the problem is not trivial from two aspects: 1) solely relying on one single U-Net to align text prompt and visual object across all the denoising timesteps is insufficient to generate desired objects; 2) the controllability of object generation is not guaranteed in the intricate sampling space of the diffusion model. In this paper, we propose to decompose the typical single-stage object inpainting into two cascaded processes: 1) semantic pre-inpainting that infers the semantic features of desired objects in a multi-modal feature space; 2) high-fidelity object generation in diffusion latent space that pivots on such inpainted semantic features. To achieve this, we cascade a Transformer-based semantic inpainter and an object inpainting diffusion model, leading to a novel CAscaded Transformer-Diffusion (CAT-Diffusion) framework for text-guided object inpainting. Technically, the semantic inpainter is trained to predict the semantic features of the target object conditioned on the unmasked context and the text prompt. The outputs of the semantic inpainter then act as informative visual prompts to guide high-fidelity object generation through a reference adapter layer, leading to controllable object inpainting. Extensive evaluations on OpenImages-V6 and MSCOCO validate the superiority of CAT-Diffusion against the state-of-the-art methods. Code is available at https://github.com/Nnn-s/CATdiffusion.
   Submitted 12 September, 2024; originally announced September 2024.
   Comments: ECCV 2024. Source code is available at https://github.com/Nnn-s/CATdiffusion

9. arXiv:2409.08258 [pdf, other]
   Subjects: cs.CV, cs.MM
   Title: Improving Virtual Try-On with Garment-focused Diffusion Models
   Authors: Siqi Wan, Yehao Li, Jingwen Chen, Yingwei Pan, Ting Yao, Yang Cao, Tao Mei
   Abstract: Diffusion models have revolutionized generative modeling in numerous image synthesis tasks. Nevertheless, it is not trivial to directly apply diffusion models to synthesize an image of a target person wearing a given in-shop garment, i.e., the image-based virtual try-on (VTON) task. The difficulty originates from the aspect that the diffusion process should not only produce a holistically high-fidelity photorealistic image of the target person, but also locally preserve every appearance and texture detail of the given garment. To address this, we shape a new diffusion model, namely GarDiff, which triggers the garment-focused diffusion process with amplified guidance of both basic visual appearance and detailed textures (i.e., high-frequency details) derived from the given garment. GarDiff first remoulds a pre-trained latent diffusion model with additional appearance priors derived from the CLIP and VAE encodings of the reference garment. Meanwhile, a novel garment-focused adapter is integrated into the UNet of the diffusion model, pursuing local fine-grained alignment with the visual appearance of the reference garment and human pose. We specifically design an appearance loss over the synthesized garment to enhance the crucial, high-frequency details. Extensive experiments on the VITON-HD and DressCode datasets demonstrate the superiority of our GarDiff when compared to state-of-the-art VTON approaches. Code is publicly available at https://github.com/siqi0905/GarDiff/tree/master.
   Submitted 12 September, 2024; originally announced September 2024.
   Comments: ECCV 2024. Source code is available at https://github.com/siqi0905/GarDiff/tree/master

10. arXiv:2409.07454 [pdf, other]
   Subjects: cs.CV, cs.MM
   Title: DreamMesh: Jointly Manipulating and Texturing Triangle Meshes for Text-to-3D Generation
   Authors: Haibo Yang, Yang Chen, Yingwei Pan, Ting Yao, Zhineng Chen, Zuxuan Wu, Yu-Gang Jiang, Tao Mei
   Abstract: Learning radiance fields (NeRF) with powerful 2D diffusion models has garnered popularity for text-to-3D generation. Nevertheless, the implicit 3D representations of NeRF lack explicit modeling of meshes and textures over surfaces, and such a surface-undefined approach may suffer from issues such as noisy surfaces with ambiguous texture details or cross-view inconsistency. To alleviate this, we present DreamMesh, a novel text-to-3D architecture that pivots on well-defined surfaces (triangle meshes) to generate high-fidelity explicit 3D models. Technically, DreamMesh capitalizes on a distinctive coarse-to-fine scheme. In the coarse stage, the mesh is first deformed by text-guided Jacobians and then DreamMesh textures the mesh with an interlaced use of 2D diffusion models in a tuning-free manner from multiple viewpoints. In the fine stage, DreamMesh jointly manipulates the mesh and refines the texture map, leading to high-quality triangle meshes with high-fidelity textured materials. Extensive experiments demonstrate that DreamMesh significantly outperforms state-of-the-art text-to-3D methods in faithfully generating 3D content with richer textual details and enhanced geometry. Our project page is available at https://dreammesh.github.io.
   Submitted 11 September, 2024; originally announced September 2024.
   Comments: ECCV 2024. Project page is available at https://dreammesh.github.io

11. arXiv:2409.07452 [pdf, other]
   Subjects: cs.CV, cs.MM
   Title: Hi3D: Pursuing High-Resolution Image-to-3D Generation with Video Diffusion Models
   Authors: Haibo Yang, Yang Chen, Yingwei Pan, Ting Yao, Zhineng Chen, Chong-Wah Ngo, Tao Mei
   Abstract: Despite tremendous progress in image-to-3D generation, existing methods still struggle to produce multi-view consistent images with high-resolution textures in detail, especially in the paradigm of 2D diffusion that lacks 3D awareness. In this work, we present the High-resolution Image-to-3D model (Hi3D), a new video-diffusion-based paradigm that redefines single-image-to-multi-view generation as 3D-aware sequential image generation (i.e., orbital video generation). This methodology delves into the underlying temporal consistency knowledge in video diffusion models that generalizes well to geometry consistency across multiple views in 3D generation. Technically, Hi3D first empowers the pre-trained video diffusion model with a 3D-aware prior (camera pose condition), yielding multi-view images with low-resolution texture details. A 3D-aware video-to-video refiner is learned to further scale up the multi-view images with high-resolution texture details. Such high-resolution multi-view images are further augmented with novel views through 3D Gaussian Splatting, which are finally leveraged to obtain high-fidelity meshes via 3D reconstruction. Extensive experiments on both novel view synthesis and single-view reconstruction demonstrate that our Hi3D manages to produce superior multi-view consistent images with highly detailed textures. Source code and data are available at https://github.com/yanghb22-fdu/Hi3D-Official.
   Submitted 11 September, 2024; originally announced September 2024.
   Comments: ACM Multimedia 2024. Source code is available at https://github.com/yanghb22-fdu/Hi3D-Official

12. arXiv:2409.07451 [pdf, other]
   Subjects: cs.CV, cs.MM
   Title: FreeEnhance: Tuning-Free Image Enhancement via Content-Consistent Noising-and-Denoising Process
   Authors: Yang Luo, Yiheng Zhang, Zhaofan Qiu, Ting Yao, Zhineng Chen, Yu-Gang Jiang, Tao Mei
   Abstract: The emergence of text-to-image generation models has led to the recognition that image enhancement, performed as post-processing, would significantly improve the visual quality of the generated images. Exploring diffusion models to enhance the generated images is nevertheless not trivial and necessitates delicately enriching plentiful details while preserving the visual appearance of key content in the original image. In this paper, we propose a novel framework, namely FreeEnhance, for content-consistent image enhancement using off-the-shelf image diffusion models. Technically, FreeEnhance is a two-stage process that first adds random noise to the input image and then capitalizes on a pre-trained image diffusion model (i.e., Latent Diffusion Models) to denoise and enhance the image details. In the noising stage, FreeEnhance is devised to add lighter noise to regions with higher frequency to preserve the high-frequency patterns (e.g., edges, corners) in the original image. In the denoising stage, we present three target properties as constraints to regularize the predicted noise, enhancing images with high acutance and high visual quality.
Extensive experiments conducted on the HPDv2 dataset demonstrate that our FreeEnhance outperforms the state-of-the-art image enhancement models in terms of quantitative metrics and human preference. More remarkably, FreeEnhance also shows higher human preference compared to the commercial image enhancement solution of Magnific AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07451v1-abstract-full').style.display = 'none'; document.getElementById('2409.07451v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM Multimedia 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02664">arXiv:2409.02664</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02664">pdf</a>, <a href="https://arxiv.org/format/2409.02664">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Standing on the Shoulders of Giants: Reprogramming Visual-Language Model for General Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+K">Kaiqing Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yuzhen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Weixiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Taiping Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02664v1-abstract-short" style="display: inline;"> The proliferation of deepfake faces poses huge potential negative impacts on our daily lives. Despite substantial advancements in deepfake detection over these years, the generalizability of existing methods against forgeries from unseen datasets or created by emerging generative models remains constrained. In this paper, inspired by the zero-shot advantages of Vision-Language Models (VLMs), we pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02664v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02664v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02664v1-abstract-full" style="display: none;"> The proliferation of deepfake faces poses huge potential negative impacts on our daily lives. Despite substantial advancements in deepfake detection over these years, the generalizability of existing methods against forgeries from unseen datasets or created by emerging generative models remains constrained. In this paper, inspired by the zero-shot advantages of Vision-Language Models (VLMs), we propose a novel approach that repurposes a well-trained VLM for general deepfake detection. 
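The frequency-adaptive noising step described in the FreeEnhance abstract can be illustrated with a minimal sketch: noise strength is reduced wherever local gradient magnitude (used here as a rough proxy for spatial frequency) is high. This is only an illustration under those assumptions, not the authors' implementation; the gradient proxy, the scaling factor, and the clipping are choices made for the example.

```python
import numpy as np

def frequency_adaptive_noise(image: np.ndarray, base_sigma: float = 0.2) -> np.ndarray:
    """Add Gaussian noise whose strength shrinks in high-frequency regions.

    `image` is a float array in [0, 1] with shape (H, W) or (H, W, C).
    Local gradient magnitude acts as a crude proxy for spatial frequency;
    regions with strong edges/corners receive lighter noise.
    """
    gray = image.mean(axis=-1) if image.ndim == 3 else image
    gy, gx = np.gradient(gray)
    grad_mag = np.sqrt(gx ** 2 + gy ** 2)
    grad_mag = grad_mag / (grad_mag.max() + 1e-8)      # normalize to [0, 1]
    sigma = base_sigma * (1.0 - 0.8 * grad_mag)        # lighter noise on edges
    if image.ndim == 3:
        sigma = sigma[..., None]
    noisy = image + np.random.randn(*image.shape) * sigma
    return np.clip(noisy, 0.0, 1.0)

# Example: noise a toy "image" before handing it to a diffusion denoiser.
img = np.random.rand(256, 256, 3)
noised = frequency_adaptive_noise(img)
```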
arXiv:2409.02664 (https://arxiv.org/abs/2409.02664) [pdf, other] cs.CV
Standing on the Shoulders of Giants: Reprogramming Visual-Language Model for General Deepfake Detection
Authors: Kaiqing Lin, Yuzhen Lin, Weixiang Li, Taiping Yao, Bin Li
Abstract: The proliferation of deepfake faces has potentially serious negative impacts on our daily lives. Despite substantial advancements in deepfake detection in recent years, the generalizability of existing methods against forgeries from unseen datasets or created by emerging generative models remains constrained. In this paper, inspired by the zero-shot advantages of Vision-Language Models (VLMs), we propose a novel approach that repurposes a well-trained VLM for general deepfake detection. Motivated by the model reprogramming paradigm, which manipulates model predictions via data perturbations, our method reprograms a pretrained VLM (e.g., CLIP) solely by manipulating its input, without tuning the inner parameters. Furthermore, we insert a pseudo-word guided by facial identity into the text prompt. Extensive experiments on several popular benchmarks demonstrate that (1) the cross-dataset and cross-manipulation performance of deepfake detection can be significantly and consistently improved (e.g., over 88% AUC in the cross-dataset setting from FF++ to WildDeepfake) using a pre-trained CLIP model with our proposed reprogramming method; and (2) this superior performance comes at a lower cost in trainable parameters, making it a promising approach for real-world applications.
Submitted 4 September, 2024; originally announced September 2024.
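As a rough illustration of the reprogramming idea above (steering a frozen model purely through a learned input perturbation), the sketch below trains only an additive input "program" and a small head on top of a frozen backbone. The backbone here is a toy stand-in rather than CLIP, and the perturbation form, head, and sizes are assumptions made for the example.

```python
import torch
import torch.nn as nn

class InputReprogrammer(nn.Module):
    """Learn a single additive input perturbation while the backbone stays frozen.

    `backbone` stands in for a pretrained vision(-language) encoder; only
    `self.delta` and the small linear head receive gradient updates.
    """
    def __init__(self, backbone: nn.Module, feat_dim: int, image_size: int = 224):
        super().__init__()
        self.backbone = backbone
        for p in self.backbone.parameters():
            p.requires_grad_(False)          # inner parameters are never tuned
        self.delta = nn.Parameter(torch.zeros(1, 3, image_size, image_size))
        self.head = nn.Linear(feat_dim, 2)   # real vs. fake

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = torch.clamp(x + self.delta, 0.0, 1.0)  # perturb the input only
        feats = self.backbone(x)                   # frozen weights, gradients still reach delta
        return self.head(feats)

# Toy stand-in backbone: a frozen conv encoder producing 128-d features.
backbone = nn.Sequential(nn.Conv2d(3, 8, 7, stride=4), nn.ReLU(),
                         nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(8 * 16, 128))
model = InputReprogrammer(backbone, feat_dim=128)
logits = model(torch.rand(2, 3, 224, 224))          # -> shape (2, 2)
```

Only `model.delta` and `model.head` would be passed to the optimizer, so the trainable-parameter count stays tiny compared with fine-tuning the encoder.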
arXiv:2408.17065 (https://arxiv.org/abs/2408.17065) [pdf, other] cs.CV
Generalizing Deepfake Video Detection with Plug-and-Play: Video-Level Blending and Spatiotemporal Adapter Tuning
Authors: Zhiyuan Yan, Yandan Zhao, Shen Chen, Xinghe Fu, Taiping Yao, Shouhong Ding, Li Yuan
Abstract: Three key challenges hinder the development of current deepfake video detection: (1) Temporal features can be complex and diverse: how can we identify general temporal artifacts to enhance model generalization? (2) Spatiotemporal models often lean heavily on one type of artifact and ignore the other: how can we ensure balanced learning from both? (3) Videos are naturally resource-intensive: how can we tackle efficiency without compromising accuracy? This paper attempts to tackle the three challenges jointly. First, inspired by the notable generality of using image-level blending data for image forgery detection, we investigate whether and how video-level blending can be effective for video. We then perform a thorough analysis and identify a previously underexplored temporal forgery artifact: Facial Feature Drift (FFD), which commonly exists across different forgeries. To reproduce FFD, we propose novel Video-level Blending data (VB), implemented by blending the original image and its warped version frame by frame, serving as a hard negative sample to mine more general artifacts. Second, we carefully design a lightweight Spatiotemporal Adapter (StA) to equip a pretrained image model (both ViTs and CNNs) with the ability to capture spatial and temporal features jointly and efficiently. StA is designed with a two-stream 3D-Conv with varying kernel sizes, allowing it to process spatial and temporal features separately. Extensive experiments validate the effectiveness of the proposed methods and show that our approach generalizes well to previously unseen forgery videos, even those from the SoTA methods just released in 2024. We release our code and pretrained weights at https://github.com/YZY-stack/StA4Deepfake.
Submitted 30 August, 2024; originally announced August 2024.
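The Video-level Blending (VB) construction, blending each original frame with a warped copy to form a hard negative, can be sketched as follows. The warp below is a deliberately simple random translation and the blend weight is a guess; the paper's actual warping field and blending details are not reproduced here.

```python
import numpy as np

def video_level_blend(frames: np.ndarray, max_shift: int = 2, alpha: float = 0.5,
                      rng=None) -> np.ndarray:
    """Create a hard-negative clip by blending each frame with a slightly warped copy.

    `frames` has shape (T, H, W, C) with values in [0, 1]. The "warp" here is a small
    random per-frame translation, a simple stand-in for a real warping field; blending
    the original and warped frames injects a subtle, drifting misalignment reminiscent
    of temporal forgery artifacts.
    """
    rng = rng or np.random.default_rng()
    blended = np.empty_like(frames)
    for t, frame in enumerate(frames):
        dy, dx = rng.integers(-max_shift, max_shift + 1, size=2)
        warped = np.roll(frame, shift=(int(dy), int(dx)), axis=(0, 1))
        blended[t] = alpha * frame + (1.0 - alpha) * warped
    return blended

clip = np.random.rand(8, 112, 112, 3)      # toy clip: 8 frames
hard_negative = video_level_blend(clip)    # used as an extra "fake" training sample
```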
arXiv:2408.15558 (https://arxiv.org/abs/2408.15558) [pdf, ps, other] cs.IT
New quantum codes from constacyclic codes over finite chain rings
Authors: Yongsheng Tang, Ting Yao, Heqian Xu, Xiaoshan Kai
Abstract: Let $R$ be the finite chain ring $\mathbb{F}_{p^{2m}}+{u}\mathbb{F}_{p^{2m}}$, where $\mathbb{F}_{p^{2m}}$ is the finite field with $p^{2m}$ elements, $p$ is a prime, $m$ is a non-negative integer, and ${u}^{2}=0$. In this paper, we first define a class of Gray maps that carry the Hermitian self-orthogonality of linear codes over $\mathbb{F}_{2^{2m}}+{u}\mathbb{F}_{2^{2m}}$ to Hermitian self-orthogonality of linear codes over $\mathbb{F}_{2^{2m}}$. Applying the Hermitian construction, a new class of $2^{m}$-ary quantum codes is obtained from Hermitian constacyclic self-orthogonal codes over $\mathbb{F}_{2^{2m}}+{u}\mathbb{F}_{2^{2m}}$. We then define another class of maps that carry the Hermitian self-orthogonality of linear codes over $R$ to trace self-orthogonality of linear codes over $\mathbb{F}_{p^{2m}}$. Using the symplectic construction, a new class of $p^{m}$-ary quantum codes is obtained from Hermitian constacyclic self-orthogonal codes over $R$.
Submitted 28 August, 2024; originally announced August 2024.
arXiv:2408.08567 (https://arxiv.org/abs/2408.08567) [pdf, other] cs.LG; cs.CV; eess.IV; stat.ML
DOI: 10.1109/JSTSP.2024.3446173
S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton Sketching
Authors: Xue Wang, Tian Zhou, Jianqing Zhu, Jialin Liu, Kun Yuan, Tao Yao, Wotao Yin, Rong Jin, HanQin Cai
Abstract: Attention-based models have achieved many remarkable breakthroughs in numerous applications. However, the quadratic complexity of Attention makes vanilla Attention-based models hard to apply to long-sequence tasks. Various improved Attention structures have been proposed to reduce the computation cost by inducing low-rankness and approximating the whole sequence by sub-sequences. The most challenging part of those approaches is maintaining the proper balance between information preservation and computation reduction: the longer the sub-sequences used, the better the information is preserved, but at the price of introducing more noise and higher computational cost. In this paper, we propose a smoothed skeleton-sketching-based Attention structure, coined S$^3$Attention, which significantly improves upon previous attempts to negotiate this trade-off. S$^3$Attention has two mechanisms to effectively minimize the impact of noise while keeping complexity linear in the sequence length: a smoothing block to mix information over long sequences and a matrix sketching method that simultaneously selects columns and rows from the input matrix. We verify the effectiveness of S$^3$Attention both theoretically and empirically. Extensive studies over the Long Range Arena (LRA) datasets and six time-series forecasting datasets show that S$^3$Attention significantly outperforms both vanilla Attention and other state-of-the-art variants of Attention structures.
Submitted 17 September, 2024; v1 submitted 16 August, 2024; originally announced August 2024.
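A schematic of the smoothing-plus-sketching idea behind the S$^3$Attention abstract: smooth the sequence, keep a small "skeleton" subset of positions, and attend only to that skeleton so the cost is linear in sequence length. The selection rule (top-k by norm), the single-sided selection of rows only, and the plain softmax attention are simplifications assumed for this sketch, not the published layer.

```python
import torch
import torch.nn.functional as F

def s3_style_attention(x: torch.Tensor, k: int = 64, window: int = 5) -> torch.Tensor:
    """Toy "smoothed skeleton" attention with O(n*k) cost instead of O(n^2).

    x: (batch, n, d) input sequence. The sequence is smoothed with a moving average,
    then k skeleton positions are kept by norm; the full sequence attends only to
    that skeleton. (The paper sketches both columns and rows; this toy keeps rows.)
    """
    b, n, d = x.shape
    # 1) smoothing block: moving average along the sequence axis
    pad = window // 2
    xs = F.avg_pool1d(x.transpose(1, 2), kernel_size=window,
                      stride=1, padding=pad).transpose(1, 2)[:, :n]
    # 2) skeleton sketching: keep the k time steps with the largest norm
    idx = xs.norm(dim=-1).topk(k, dim=1).indices                         # (b, k)
    skeleton = torch.gather(xs, 1, idx.unsqueeze(-1).expand(-1, -1, d))  # (b, k, d)
    # 3) attend from the full sequence to the skeleton only
    attn = torch.softmax(xs @ skeleton.transpose(1, 2) / d ** 0.5, dim=-1)  # (b, n, k)
    return attn @ skeleton                                               # (b, n, d)

out = s3_style_attention(torch.randn(2, 1024, 32))  # -> (2, 1024, 32)
```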
arXiv:2408.06381 (https://arxiv.org/abs/2408.06381) [pdf, other] eess.IV; cs.AI; cs.CV
Assessment of Cell Nuclei AI Foundation Models in Kidney Pathology
Authors: Junlin Guo, Siqi Lu, Can Cui, Ruining Deng, Tianyuan Yao, Zhewen Tao, Yizhe Lin, Marilyn Lionts, Quan Liu, Juming Xiong, Catie Chang, Mitchell Wilkes, Mengmeng Yin, Haichun Yang, Yuankai Huo
Abstract: Cell nuclei instance segmentation is a crucial task in digital kidney pathology. Traditional automatic segmentation methods often lack generalizability when applied to unseen datasets. Recently, the success of foundation models (FMs) has provided a more generalizable solution, potentially enabling the segmentation of any cell type. In this study, we perform a large-scale evaluation of three widely used state-of-the-art (SOTA) cell nuclei foundation models (Cellpose, StarDist, and CellViT). Specifically, we created a highly diverse evaluation dataset consisting of 2,542 kidney whole slide images (WSIs) collected from both human and rodent sources, encompassing various tissue types, sizes, and staining methods. To our knowledge, this is the largest-scale evaluation of its kind to date. Our quantitative analysis of the prediction distribution reveals a persistent performance gap in kidney pathology. Among the evaluated models, CellViT demonstrated superior performance in segmenting nuclei in kidney pathology. However, none of the foundation models are perfect; a performance gap remains in general nuclei segmentation for kidney pathology.
Submitted 9 August, 2024; originally announced August 2024.
arXiv:2407.18390 (https://arxiv.org/abs/2407.18390) [pdf, other] eess.IV; cs.CV
Adapting Mouse Pathological Model to Human Glomerular Lesion Segmentation
Authors: Lining Yu, Mengmeng Yin, Ruining Deng, Quan Liu, Tianyuan Yao, Can Cui, Yu Wang, Yaohong Wang, Shilin Zhao, Haichun Yang, Yuankai Huo
Abstract: Moving from animal models to human applications in preclinical research encompasses a broad spectrum of disciplines in medical science. A fundamental element in the development of new drugs, treatments, and diagnostic methods, and in deepening our understanding of disease processes, is the accurate measurement of kidney tissues. Past studies have demonstrated the viability of translating glomeruli segmentation techniques from mouse models to human applications. Yet these investigations tend to neglect the complexities involved in segmenting pathological glomeruli affected by different lesions. Such lesions present a wider range of morphological variations than healthy glomerular tissue and are arguably more valuable than normal glomeruli in clinical practice. Furthermore, data on lesions from animal models can be more readily scaled up from disease models and whole kidney biopsies. This brings up a question: "Can a pathological segmentation model trained on mouse models be effectively applied to human patients?" To answer it, we introduce GLAM, a deep learning study for fine-grained segmentation of human kidney lesions using a mouse model. GLAM addresses mouse-to-human transfer by evaluating different learning strategies for segmenting human pathological lesions: zero-shot transfer learning and hybrid learning that leverages mouse samples. From the results, the hybrid learning model achieved superior performance.
Submitted 25 July, 2024; originally announced July 2024.
arXiv:2407.14429 (https://arxiv.org/abs/2407.14429) [pdf, other] eess.IV; cs.CV
Dataset Distillation in Medical Imaging: A Feasibility Study
Authors: Muyang Li, Can Cui, Quan Liu, Ruining Deng, Tianyuan Yao, Marilyn Lionts, Yuankai Huo
Abstract: Data sharing in the medical image analysis field has potential yet remains underappreciated. The aim is often to share datasets efficiently with other sites so that models can be trained effectively. One possible solution is to avoid transferring the entire dataset while still achieving similar model performance. Recent progress in data distillation within computer science offers promising prospects for sharing medical data efficiently without significantly compromising model effectiveness. However, it remains uncertain whether these methods are applicable to medical imaging, since medical and natural images are distinct domains. Moreover, it is intriguing to consider what level of performance can be achieved with these methods. To answer these questions, we investigate a variety of leading data distillation methods in different medical imaging contexts. We evaluate the feasibility of these methods with extensive experiments in two aspects: (1) assessing the impact of data distillation across multiple datasets characterized by minor or great variations, and (2) exploring indicators that predict distillation performance. Our extensive experiments across multiple medical datasets reveal that data distillation can significantly reduce dataset size while maintaining comparable model performance to that achieved with the full dataset, suggesting that a small, representative sample of images can serve as a reliable indicator of distillation success. This study demonstrates that data distillation is a viable method for efficient and secure medical data sharing, with the potential to facilitate enhanced collaborative research and clinical applications.
Submitted 19 July, 2024; originally announced July 2024.
arXiv:2407.11083 (https://arxiv.org/abs/2407.11083) [pdf, other] cs.LG
Empowering Graph Invariance Learning with Deep Spurious Infomax
Authors: Tianjun Yao, Yongqiang Chen, Zhenhao Chen, Kai Hu, Zhiqiang Shen, Kun Zhang
Abstract: Recently, there has been a surge of interest in developing graph neural networks that utilize the invariance principle on graphs to generalize to out-of-distribution (OOD) data. Due to limited knowledge about OOD data, existing approaches often make assumptions about the correlation strengths between the underlying spurious features and the target labels. However, this prior is often unavailable and can change arbitrarily in real-world scenarios, which may lead to severe failures of existing graph invariance learning methods. To bridge this gap, we introduce a novel graph invariance learning paradigm that induces a robust and general inductive bias. The paradigm is built upon the observation that the infomax principle encourages learning spurious features regardless of spurious correlation strengths. We further propose the EQuAD framework, which realizes this learning paradigm and employs tailored learning objectives that provably elicit invariant features by disentangling them from the spurious features learned through infomax. Notably, EQuAD shows stable and enhanced performance across different degrees of bias in synthetic datasets and on challenging real-world datasets, with improvements of up to $31.76\%$. Our code is available at https://github.com/tianyao-aka/EQuAD.
Submitted 13 July, 2024; originally announced July 2024.
Comments: ICML 2024 camera-ready version
ACM Class: I.2.6
arXiv:2407.09979 (https://arxiv.org/abs/2407.09979) [pdf, other] cs.CV
PFPs: Prompt-guided Flexible Pathological Segmentation for Diverse Potential Outcomes Using Large Vision and Language Models
Authors: Can Cui, Ruining Deng, Junlin Guo, Quan Liu, Tianyuan Yao, Haichun Yang, Yuankai Huo
Abstract: The Vision Foundation Model has recently gained attention in medical image analysis. Its zero-shot learning capabilities accelerate AI deployment and enhance the generalizability of clinical applications. However, segmenting pathological images places a special focus on the flexibility of segmentation targets. For instance, a single click on a Whole Slide Image (WSI) could signify a cell, a functional unit, or a tissue layer, adding complexity to the segmentation task. Current models primarily predict potential outcomes but lack the flexibility needed for physician input. In this paper, we explore the potential of enhancing segmentation model flexibility by introducing various task prompts through a Large Language Model (LLM) alongside traditional task tokens. Our contribution is four-fold: (1) we construct a computationally efficient pipeline that uses fine-tuned language prompts to guide flexible multi-class segmentation; (2) we compare segmentation performance with fixed prompts against free text; (3) we design a multi-task kidney pathology segmentation dataset and the corresponding various free-text prompts; and (4) we evaluate our approach on the kidney pathology dataset, assessing its capacity to handle new cases during inference.
Submitted 13 July, 2024; originally announced July 2024.
arXiv:2407.03307 (https://arxiv.org/abs/2407.03307) [pdf, other] eess.IV; cs.CV
HoloHisto: End-to-end Gigapixel WSI Segmentation with 4K Resolution Sequential Tokenization
Authors: Yucheng Tang, Yufan He, Vishwesh Nath, Pengfeig Guo, Ruining Deng, Tianyuan Yao, Quan Liu, Can Cui, Mengmeng Yin, Ziyue Xu, Holger Roth, Daguang Xu, Haichun Yang, Yuankai Huo
Abstract: In digital pathology, the traditional method for deep learning-based image segmentation typically involves a two-stage process: initially segmenting high-resolution whole slide images (WSI) into smaller patches (e.g., 256x256, 512x512, 1024x1024) and subsequently reconstructing them to their original scale. This method often struggles to capture the complex details and vast scope of WSIs. In this paper, we propose the holistic histopathology (HoloHisto) segmentation method to achieve end-to-end segmentation on gigapixel WSIs, whose maximum resolution is above 80,000$\times$70,000 pixels. HoloHisto fundamentally shifts the paradigm of WSI segmentation to an end-to-end learning fashion with (1) a large (4K) resolution base patch for elevated visual information inclusion and efficient processing, and (2) a novel sequential tokenization mechanism to properly model the contextual relationships and efficiently model the rich information from the 4K input. To our best knowledge, HoloHisto presents the first holistic approach for gigapixel-resolution WSI segmentation, supporting direct I/O of complete WSIs and their corresponding gigapixel masks. Under the HoloHisto platform, we unveil a random 4K sampler that transcends ultra-high resolution, delivering 31 and 10 times more pixels than standard 2D and 3D patches, respectively, for advancing computational capabilities. To facilitate efficient 4K-resolution dense prediction, we leverage sequential tokenization, utilizing a pre-trained image tokenizer to group image features into a discrete token grid. To assess the performance, our team curated a new kidney pathology image segmentation (KPIs) dataset with WSI-level glomeruli segmentation from whole mouse kidneys. From the results, HoloHisto-4K delivers remarkable performance gains over previous state-of-the-art models.
Submitted 3 July, 2024; originally announced July 2024.
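Sequential tokenization of a large base patch, as mentioned in the HoloHisto abstract, amounts to mapping the patch to a grid of discrete tokens that a sequence model can consume. The toy tokenizer below uses a random conv encoder and codebook purely for illustration; the actual system relies on a pre-trained image tokenizer and 4K (e.g., 4096x4096) inputs, which are assumptions not reproduced here.

```python
import torch
import torch.nn as nn

class ToyImageTokenizer(nn.Module):
    """Map an image patch to a grid of discrete tokens via a VQ-style codebook lookup.

    A conv encoder downsamples the patch, and each spatial feature vector is snapped
    to its nearest codebook entry, giving a token grid that can be flattened into a
    sequence. Encoder and codebook are random, untrained stand-ins.
    """
    def __init__(self, codebook_size: int = 1024, dim: int = 64, downsample: int = 16):
        super().__init__()
        self.encoder = nn.Conv2d(3, dim, kernel_size=downsample, stride=downsample)
        self.codebook = nn.Embedding(codebook_size, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        feats = self.encoder(x)                          # (B, dim, H/ds, W/ds)
        b, d, h, w = feats.shape
        flat = feats.permute(0, 2, 3, 1).reshape(-1, d)  # (B*h*w, dim)
        dists = torch.cdist(flat, self.codebook.weight)  # distance to each code
        tokens = dists.argmin(dim=-1).reshape(b, h, w)   # discrete token grid
        return tokens

tokenizer = ToyImageTokenizer()
patch = torch.rand(1, 3, 512, 512)        # small stand-in for a 4K base patch
token_grid = tokenizer(patch)             # -> (1, 32, 32) integer tokens
sequence = token_grid.flatten(1)          # 1024-token sequence for a transformer
```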
arXiv:2407.00596 (https://arxiv.org/abs/2407.00596) [pdf, other] eess.IV; cs.CV
HATs: Hierarchical Adaptive Taxonomy Segmentation for Panoramic Pathology Image Analysis
Authors: Ruining Deng, Quan Liu, Can Cui, Tianyuan Yao, Juming Xiong, Shunxing Bao, Hao Li, Mengmeng Yin, Yu Wang, Shilin Zhao, Yucheng Tang, Haichun Yang, Yuankai Huo
Abstract: Panoramic image segmentation in computational pathology presents a remarkable challenge due to the morphologically complex and variably scaled anatomy. For instance, the intricate organization in kidney pathology spans multiple layers, from regions like the cortex and medulla to functional units such as glomeruli, tubules, and vessels, down to various cell types. In this paper, we propose a novel Hierarchical Adaptive Taxonomy Segmentation (HATs) method, which is designed to thoroughly segment panoramic views of kidney structures by leveraging detailed anatomical insights. Our approach entails (1) the HATs technique, which translates spatial relationships among 15 distinct object classes into a versatile "plug-and-play" loss function that spans regions, functional units, and cells; (2) the incorporation of anatomical hierarchies and scale considerations into a unified, simple matrix representation for all panoramic entities; and (3) the adoption of the latest AI foundation model (EfficientSAM) as a feature extraction tool to boost the model's adaptability while eliminating the need for the manual prompt generation required by the conventional Segment Anything Model (SAM). Experimental findings demonstrate that the HATs method offers an efficient and effective strategy for integrating clinical insights and imaging precedents into a unified segmentation model across more than 15 categories. The official implementation is publicly available at https://github.com/hrlblab/HATs.
Submitted 30 June, 2024; originally announced July 2024.
Comments: arXiv admin note: text overlap with arXiv:2402.19286
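One way a class-relation matrix can be turned into a "plug-and-play" hierarchy-aware penalty is sketched below: a child class's per-pixel probability is discouraged from exceeding that of any of its ancestors. The parent/child matrix, the 15-class setup, and the hinge-style penalty are illustrative assumptions and not the loss defined in the HATs paper.

```python
import torch

def hierarchy_consistency_loss(probs: torch.Tensor, parent: torch.Tensor) -> torch.Tensor:
    """Penalize predictions that violate a class hierarchy encoded as a matrix.

    probs:  (B, C, H, W) per-pixel class probabilities (e.g., sigmoid outputs).
    parent: (C, C) binary matrix with parent[c, p] = 1 if class p is an ancestor of
            class c (e.g., a glomerulus lies inside the cortex). A child class's
            probability should never exceed any ancestor's probability.
    """
    b, c, h, w = probs.shape
    flat = probs.reshape(b, c, -1)                      # (B, C, HW)
    child = flat.unsqueeze(2)                           # (B, C, 1, HW)
    ancestor = flat.unsqueeze(1)                        # (B, 1, C, HW)
    violation = torch.relu(child - ancestor)            # child prob above ancestor prob
    mask = parent.to(probs).unsqueeze(0).unsqueeze(-1)  # (1, C, C, 1)
    return (violation * mask).mean()

# Toy usage with 15 classes: class 1 is declared a child of class 0.
parent = torch.zeros(15, 15)
parent[1, 0] = 1.0
probs = torch.sigmoid(torch.randn(2, 15, 64, 64))
loss = hierarchy_consistency_loss(probs, parent)   # added to the usual segmentation loss
```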
arXiv:2407.00247 (https://arxiv.org/abs/2407.00247) [pdf, other] cs.CV
Prompt Refinement with Image Pivot for Text-to-Image Generation
Authors: Jingtao Zhan, Qingyao Ai, Yiqun Liu, Yingwei Pan, Ting Yao, Jiaxin Mao, Shaoping Ma, Tao Mei
Abstract: For text-to-image generation, automatically refining user-provided natural language prompts into the keyword-enriched prompts favored by systems is essential for the user experience. Such a prompt refinement process is analogous to translating the prompt from "user languages" into "system languages". However, the scarcity of such parallel corpora makes it difficult to train a prompt refinement model. Inspired by zero-shot machine translation techniques, we introduce Prompt Refinement with Image Pivot (PRIP). PRIP innovatively uses the latent representation of a user-preferred image as an intermediary "pivot" between the user and system languages. It decomposes the refinement process into two data-rich tasks: inferring representations of user-preferred images from user languages, and subsequently translating image representations into system languages. Thus, it can leverage abundant data for training. Extensive experiments show that PRIP substantially outperforms a wide range of baselines and effectively transfers to unseen systems in a zero-shot manner.
Submitted 28 June, 2024; originally announced July 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19832">arXiv:2406.19832</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19832">pdf</a>, <a href="https://arxiv.org/format/2406.19832">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3589334.3645542">10.1145/3589334.3645542 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MuGSI: Distilling GNNs with Multi-Granularity Structural Information for Graph Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tianjun Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jiaqi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+D">Defu Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19832v1-abstract-short" style="display: inline;"> Recent works have introduced GNN-to-MLP knowledge distillation (KD) frameworks to combine both GNN&#39;s superior performance and MLP&#39;s fast inference speed. However, existing KD frameworks are primarily designed for node classification within single graphs, leaving their applicability to graph classification largely unexplored. Two main challenges arise when extending KD for node classification to gr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19832v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19832v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19832v1-abstract-full" style="display: none;"> Recent works have introduced GNN-to-MLP knowledge distillation (KD) frameworks to combine both GNN&#39;s superior performance and MLP&#39;s fast inference speed. However, existing KD frameworks are primarily designed for node classification within single graphs, leaving their applicability to graph classification largely unexplored. Two main challenges arise when extending KD for node classification to graph classification: (1) The inherent sparsity of learning signals due to soft labels being generated at the graph level; (2) The limited expressiveness of student MLPs, especially in datasets with limited input feature spaces. To overcome these challenges, we introduce MuGSI, a novel KD framework that employs Multi-granularity Structural Information for graph classification. Specifically, we propose multi-granularity distillation loss in MuGSI to tackle the first challenge. 
This loss function is composed of three distinct components: graph-level distillation, subgraph-level distillation, and node-level distillation. Each component targets a specific granularity of the graph structure, ensuring a comprehensive transfer of structural knowledge from the teacher model to the student model. To tackle the second challenge, MuGSI proposes to incorporate a node feature augmentation component, thereby enhancing the expressiveness of the student MLPs and making them more capable learners. We perform extensive experiments across a variety of datasets and different teacher/student model architectures. The experiment results demonstrate the effectiveness, efficiency, and robustness of MuGSI. Codes are publicly available at: \textbf{\url{https://github.com/tianyao-aka/MuGSI}.} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19832v1-abstract-full').style.display = 'none'; document.getElementById('2406.19832v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 4 figures. Accepted by TheWebConf2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19540">arXiv:2406.19540</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19540">pdf</a>, <a href="https://arxiv.org/format/2406.19540">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Weighted Circle Fusion: Ensembling Circle Representation from Different Object Detection Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yue%2C+J">Jialin Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tianyuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+R">Ruining Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Quan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+J">Juming Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Haichun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Y">Yuankai Huo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19540v1-abstract-short" style="display: inline;"> Recently, the use of circle representation has emerged as a method to improve the identification of spherical objects (such as glomeruli, cells, and nuclei) in medical imaging studies. In traditional bounding box-based object detection, combining results from multiple models improves accuracy, especially when real-time processing isn&#39;t crucial. 
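Returning to the MuGSI entry above: its multi-granularity distillation loss combines graph-level, subgraph-level, and node-level terms. A rough sketch of how such a loss could be assembled; the weights, the mean-pooled cluster stand-in for subgraphs, and the specific distance choices are illustrative assumptions, not the published formulation.

```python
import torch
import torch.nn.functional as F

def multi_granularity_kd_loss(t_graph_logits, s_graph_logits,
                              t_node_emb, s_node_emb,
                              cluster_assign, T=2.0,
                              w_graph=1.0, w_subgraph=0.5, w_node=0.5):
    """Illustrative combination of graph-, subgraph-, and node-level distillation terms.
    `cluster_assign` is a hypothetical LongTensor of shape [num_nodes] with a cluster id
    per node, standing in for the subgraph partition."""
    # Graph-level: soften teacher/student logits and match them with KL divergence.
    graph_term = F.kl_div(F.log_softmax(s_graph_logits / T, dim=-1),
                          F.softmax(t_graph_logits / T, dim=-1),
                          reduction="batchmean") * T * T
    # Subgraph-level: mean-pool node embeddings per cluster and match the pooled summaries.
    num_clusters = int(cluster_assign.max()) + 1
    one_hot = F.one_hot(cluster_assign, num_clusters).float()                   # [N, C]
    pool = one_hot.t() / one_hot.sum(dim=0, keepdim=True).t().clamp(min=1)      # [C, N]
    subgraph_term = F.mse_loss(pool @ s_node_emb, pool @ t_node_emb)
    # Node-level: match individual node embeddings directly.
    node_term = F.mse_loss(s_node_emb, t_node_emb)
    return w_graph * graph_term + w_subgraph * subgraph_term + w_node * node_term
```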
Unfortunately, this widely adopted strategy is not re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19540v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19540v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19540v1-abstract-full" style="display: none;"> Recently, the use of circle representation has emerged as a method to improve the identification of spherical objects (such as glomeruli, cells, and nuclei) in medical imaging studies. In traditional bounding box-based object detection, combining results from multiple models improves accuracy, especially when real-time processing isn&#39;t crucial. Unfortunately, this widely adopted strategy is not readily available for combining circle representations. In this paper, we propose Weighted Circle Fusion (WCF), a simple approach for merging predictions from various circle detection models. Our method leverages confidence scores associated with each proposed bounding circle to generate averaged circles. The method undergoes thorough evaluation on a proprietary dataset for glomerular detection within whole slide imaging (WSI). The findings reveal a performance gain of 5% compared to existing ensemble methods. Furthermore, the Weighted Circle Fusion technique not only improves the precision of object detection in medical images but also notably decreases false detections, presenting a promising direction for future research and application in pathological image analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19540v1-abstract-full').style.display = 'none'; document.getElementById('2406.19540v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
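The WCF entry above merges circle detections from several models by confidence-weighted averaging. A small, hypothetical sketch of that fusion step; the greedy matching rule and score handling below are assumptions, and the authors' actual criteria may differ.

```python
import numpy as np

def circles_match(c1, c2):
    """Crude overlap test for circles given as (cx, cy, r): center distance vs. mean radius."""
    d = np.hypot(c1[0] - c2[0], c1[1] - c2[1])
    return d < (c1[2] + c2[2]) / 2.0   # illustrative rule, not the paper's criterion

def weighted_circle_fusion(circles, scores, match_fn=circles_match):
    """Greedy, confidence-weighted merging of circle detections pooled from multiple models.
    `circles` is an (N, 3) array of (cx, cy, r); `scores` is an (N,) array of confidences."""
    order = np.argsort(scores)[::-1]          # visit detections from most to least confident
    used = np.zeros(len(circles), dtype=bool)
    fused = []
    for i in order:
        if used[i]:
            continue
        group = [j for j in order if not used[j] and match_fn(circles[i], circles[j])]
        for j in group:
            used[j] = True
        w = scores[group] / scores[group].sum()
        fused.append((np.average(circles[group], axis=0, weights=w), float(scores[group].mean())))
    return fused  # list of (confidence-averaged circle, fused score)
```

For example, pooling detections from several models as `weighted_circle_fusion(np.vstack(all_circles), np.concatenate(all_scores))` would return one averaged circle per matched group.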
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19244">arXiv:2406.19244</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19244">pdf</a>, <a href="https://arxiv.org/format/2406.19244">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3580305.3599390">10.1145/3580305.3599390 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Improving the Expressiveness of $K$-hop Message-Passing GNNs by Injecting Contextualized Substructure Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tianjun Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yiongxu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+S">Shangsong Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19244v1-abstract-short" style="display: inline;"> Graph neural networks (GNNs) have become the \textit{de facto} standard for representational learning in graphs, and have achieved state-of-the-art performance in many graph-related tasks; however, it has been shown that the expressive power of standard GNNs are equivalent maximally to 1-dimensional Weisfeiler-Lehman (1-WL) Test. Recently, there is a line of works aiming to enhance the expressive&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19244v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19244v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19244v1-abstract-full" style="display: none;"> Graph neural networks (GNNs) have become the \textit{de facto} standard for representational learning in graphs, and have achieved state-of-the-art performance in many graph-related tasks; however, it has been shown that the expressive power of standard GNNs are equivalent maximally to 1-dimensional Weisfeiler-Lehman (1-WL) Test. Recently, there is a line of works aiming to enhance the expressive power of graph neural networks. One line of such works aim at developing $K$-hop message-passing GNNs where node representation is updated by aggregating information from not only direct neighbors but all neighbors within $K$-hop of the node. Another line of works leverages subgraph information to enhance the expressive power which is proven to be strictly more powerful than 1-WL test. In this work, we discuss the limitation of $K$-hop message-passing GNNs and propose \textit{substructure encoding function} to uplift the expressive power of any $K$-hop message-passing GNN. We further inject contextualized substructure information to enhance the expressiveness of $K$-hop message-passing GNNs. 
Our method is provably more powerful than previous works on $K$-hop graph neural networks and 1-WL subgraph GNNs, which is a specific type of subgraph based GNN models, and not less powerful than 3-WL. Empirically, our proposed method set new state-of-the-art performance or achieves comparable performance for a variety of datasets. Our code is available at \url{https://github.com/tianyao-aka/Expresive_K_hop_GNNs}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19244v1-abstract-full').style.display = 'none'; document.getElementById('2406.19244v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, published in Research track of KDD2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13495">arXiv:2406.13495</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13495">pdf</a>, <a href="https://arxiv.org/format/2406.13495">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DF40: Toward Next-Generation Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Z">Zhiyuan Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Taiping Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yandan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+X">Xinghe Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Junwei Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chengjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+S">Shouhong Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yunsheng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+L">Li Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13495v2-abstract-short" style="display: inline;"> We propose a new comprehensive benchmark to revolutionize the current deepfake detection field to the next generation. Predominantly, existing works identify top-notch detection algorithms and models by adhering to the common practice: training detectors on one specific dataset (e.g., FF++) and testing them on other prevalent deepfake datasets. 
This protocol is often regarded as a &#34;golden compass&#34;&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13495v2-abstract-full').style.display = 'inline'; document.getElementById('2406.13495v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13495v2-abstract-full" style="display: none;"> We propose a new comprehensive benchmark to revolutionize the current deepfake detection field to the next generation. Predominantly, existing works identify top-notch detection algorithms and models by adhering to the common practice: training detectors on one specific dataset (e.g., FF++) and testing them on other prevalent deepfake datasets. This protocol is often regarded as a &#34;golden compass&#34; for navigating SoTA detectors. But can these stand-out &#34;winners&#34; be truly applied to tackle the myriad of realistic and diverse deepfakes lurking in the real world? If not, what underlying factors contribute to this gap? In this work, we found the dataset (both train and test) can be the &#34;primary culprit&#34; due to: (1) forgery diversity: Deepfake techniques are commonly referred to as both face forgery and entire image synthesis. Most existing datasets only contain partial types of them, with limited forgery methods implemented; (2) forgery realism: The dominated training dataset, FF++, contains out-of-date forgery techniques from the past four years. &#34;Honing skills&#34; on these forgeries makes it difficult to guarantee effective detection generalization toward nowadays&#39; SoTA deepfakes; (3) evaluation protocol: Most detection works perform evaluations on one type, which hinders the development of universal deepfake detectors. To address this dilemma, we construct a highly diverse deepfake detection dataset called DF40, which comprises 40 distinct deepfake techniques. We then conduct comprehensive evaluations using 4 standard evaluation protocols and 8 representative detection methods, resulting in over 2,000 evaluations. Through these evaluations, we provide an extensive analysis from various perspectives, leading to 7 new insightful findings. We also open up 4 valuable yet previously underexplored research questions to inspire future works. Our project page is https://github.com/YZY-stack/DF40. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13495v2-abstract-full').style.display = 'none'; document.getElementById('2406.13495v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2108.05080 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01884">arXiv:2406.01884</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.01884">pdf</a>, <a href="https://arxiv.org/format/2406.01884">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rank-based No-reference Quality Assessment for Face Swapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xinghui Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+W">Wenbo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+T">Tianyi Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Taiping Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+S">Shouhong Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weiming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+N">Nenghai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01884v1-abstract-short" style="display: inline;"> Face swapping has become a prominent research area in computer vision and image processing due to rapid technological advancements. The metric of measuring the quality in most face swapping methods relies on several distances between the manipulated images and the source image, or the target image, i.e., there are suitable known reference face images. Therefore, there is still a gap in accurately&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01884v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01884v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01884v1-abstract-full" style="display: none;"> Face swapping has become a prominent research area in computer vision and image processing due to rapid technological advancements. The metric of measuring the quality in most face swapping methods relies on several distances between the manipulated images and the source image, or the target image, i.e., there are suitable known reference face images. Therefore, there is still a gap in accurately assessing the quality of face interchange in reference-free scenarios. In this study, we present a novel no-reference image quality assessment (NR-IQA) method specifically designed for face swapping, addressing this issue by constructing a comprehensive large-scale dataset, implementing a method for ranking image quality based on multiple facial attributes, and incorporating a Siamese network based on interpretable qualitative comparisons. Our model demonstrates the state-of-the-art performance in the quality assessment of swapped faces, providing coarse- and fine-grained. 
Enhanced by this metric, an improved face-swapping model achieved a more advanced level with respect to expressions and poses. Extensive experiments confirm the superiority of our method over existing general no-reference image quality assessment metrics and the latest metric of facial image quality assessment, making it well suited for evaluating face swapping images in real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01884v1-abstract-full').style.display = 'none'; document.getElementById('2406.01884v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17824">arXiv:2405.17824</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.17824">pdf</a>, <a href="https://arxiv.org/format/2405.17824">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> mTREE: Multi-Level Text-Guided Representation End-to-End Learning for Whole Slide Image Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Quan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+R">Ruining Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+C">Can Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tianyuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Nath%2C+V">Vishwesh Nath</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yucheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Y">Yuankai Huo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17824v1-abstract-short" style="display: inline;"> Multi-modal learning adeptly integrates visual and textual data, but its application to histopathology image and text analysis remains challenging, particularly with large, high-resolution images like gigapixel Whole Slide Images (WSIs). Current methods typically rely on manual region labeling or multi-stage learning to assemble local representations (e.g., patch-level) into global features (e.g.,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17824v1-abstract-full').style.display = 'inline'; document.getElementById('2405.17824v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17824v1-abstract-full" style="display: none;"> Multi-modal learning adeptly integrates visual and textual data, but its application to histopathology image and text analysis remains challenging, particularly with large, high-resolution images like gigapixel Whole Slide Images (WSIs). 
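The rank-based no-reference quality assessment entry above (arXiv:2406.01884) trains a Siamese scorer on ranked pairs of swapped faces. A minimal sketch of pairwise ranking with a shared-weight scorer; the tiny network, image size, and margin are placeholder assumptions rather than the paper's model.

```python
import torch
import torch.nn as nn

# A tiny shared-weight ("Siamese") scorer: both images in a ranked pair pass through the same network.
scorer = nn.Sequential(nn.Flatten(), nn.Linear(3 * 64 * 64, 256), nn.ReLU(), nn.Linear(256, 1))
ranking_loss = nn.MarginRankingLoss(margin=0.1)

better = torch.randn(8, 3, 64, 64)   # images ranked higher by the attribute-based ordering
worse = torch.randn(8, 3, 64, 64)    # images ranked lower
target = torch.ones(8)               # +1 means the first input should receive the higher score

loss = ranking_loss(scorer(better).squeeze(1), scorer(worse).squeeze(1), target)
loss.backward()
```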
Current methods typically rely on manual region labeling or multi-stage learning to assemble local representations (e.g., patch-level) into global features (e.g., slide-level). However, there is no effective way to integrate multi-scale image representations with text data in a seamless end-to-end process. In this study, we introduce Multi-Level Text-Guided Representation End-to-End Learning (mTREE). This novel text-guided approach effectively captures multi-scale WSI representations by utilizing information from accompanying textual pathology information. mTREE innovatively combines - the localization of key areas (global-to-local) and the development of a WSI-level image-text representation (local-to-global) - into a unified, end-to-end learning framework. In this model, textual information serves a dual purpose: firstly, functioning as an attention map to accurately identify key areas, and secondly, acting as a conduit for integrating textual features into the comprehensive representation of the image. Our study demonstrates the effectiveness of mTREE through quantitative analyses in two image-related tasks: classification and survival prediction, showcasing its remarkable superiority over baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17824v1-abstract-full').style.display = 'none'; document.getElementById('2405.17824v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09113">arXiv:2405.09113</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.09113">pdf</a>, <a href="https://arxiv.org/ps/2405.09113">ps</a>, <a href="https://arxiv.org/format/2405.09113">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient LLM Jailbreak via Adaptive Dense-to-sparse Constrained Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+K">Kai Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Weichen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tianjun Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenhe Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lijun Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yining Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Z">Zhiqiang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Fredrikson%2C+M">Matt Fredrikson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09113v1-abstract-short" style="display: inline;"> Recent research indicates that large language models (LLMs) are susceptible to jailbreaking attacks that can generate harmful content. 
This paper introduces a novel token-level attack method, Adaptive Dense-to-Sparse Constrained Optimization (ADC), which effectively jailbreaks several open-source LLMs. Our approach relaxes the discrete jailbreak optimization into a continuous optimization and prog&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09113v1-abstract-full').style.display = 'inline'; document.getElementById('2405.09113v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09113v1-abstract-full" style="display: none;"> Recent research indicates that large language models (LLMs) are susceptible to jailbreaking attacks that can generate harmful content. This paper introduces a novel token-level attack method, Adaptive Dense-to-Sparse Constrained Optimization (ADC), which effectively jailbreaks several open-source LLMs. Our approach relaxes the discrete jailbreak optimization into a continuous optimization and progressively increases the sparsity of the optimizing vectors. Consequently, our method effectively bridges the gap between discrete and continuous space optimization. Experimental results demonstrate that our method is more effective and efficient than existing token-level methods. On Harmbench, our method achieves state of the art attack success rate on seven out of eight LLMs. Code will be made available. Trigger Warning: This paper contains model behavior that can be offensive in nature. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09113v1-abstract-full').style.display = 'none'; document.getElementById('2405.09113v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.03652">arXiv:2405.03652</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.03652">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1117/1.JMI.11.4.044008">10.1117/1.JMI.11.4.044008 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Field-of-View Extension for Brain Diffusion MRI via Deep Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chenyu Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+S">Shunxing Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Michael Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Newlin%2C+N">Nancy Newlin</a>, <a href="/search/cs?searchtype=author&amp;query=Kanakaraj%2C+P">Praitayini Kanakaraj</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tianyuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Rudravaram%2C+G">Gaurav Rudravaram</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Y">Yuankai Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Moyer%2C+D">Daniel Moyer</a>, <a href="/search/cs?searchtype=author&amp;query=Schilling%2C+K">Kurt Schilling</a>, <a href="/search/cs?searchtype=author&amp;query=Kukull%2C+W">Walter Kukull</a>, <a href="/search/cs?searchtype=author&amp;query=Toga%2C+A">Arthur Toga</a>, <a href="/search/cs?searchtype=author&amp;query=Archer%2C+D">Derek Archer</a>, <a href="/search/cs?searchtype=author&amp;query=Hohman%2C+T">Timothy Hohman</a>, <a href="/search/cs?searchtype=author&amp;query=Landman%2C+B">Bennett Landman</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhiyuan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.03652v2-abstract-short" style="display: inline;"> Purpose: In diffusion MRI (dMRI), the volumetric and bundle analyses of whole-brain tissue microstructure and connectivity can be severely impeded by an incomplete field-of-view (FOV). This work aims to develop a method for imputing the missing slices directly from existing dMRI scans with an incomplete FOV. We hypothesize that the imputed image with complete FOV can improve the whole-brain tracto&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03652v2-abstract-full').style.display = 'inline'; document.getElementById('2405.03652v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.03652v2-abstract-full" style="display: none;"> Purpose: In diffusion MRI (dMRI), the volumetric and bundle analyses of whole-brain tissue microstructure and connectivity can be severely impeded by an incomplete field-of-view (FOV). This work aims to develop a method for imputing the missing slices directly from existing dMRI scans with an incomplete FOV. 
We hypothesize that the imputed image with complete FOV can improve the whole-brain tractography for corrupted data with incomplete FOV. Therefore, our approach provides a desirable alternative to discarding the valuable dMRI data, enabling subsequent tractography analyses that would otherwise be challenging or unattainable with corrupted data. Approach: We propose a framework based on a deep generative model that estimates the absent brain regions in dMRI scans with incomplete FOV. The model is capable of learning both the diffusion characteristics in diffusion-weighted images (DWI) and the anatomical features evident in the corresponding structural images for efficiently imputing missing slices of DWI outside of incomplete FOV. Results: For evaluating the imputed slices, on the WRAP dataset the proposed framework achieved PSNRb0=22.397, SSIMb0=0.905, PSNRb1300=22.479, SSIMb1300=0.893; on the NACC dataset it achieved PSNRb0=21.304, SSIMb0=0.892, PSNRb1300=21.599, SSIMb1300= 0.877. The proposed framework improved the tractography accuracy, as demonstrated by an increased average Dice score for 72 tracts (p &lt; 0.001) on both the WRAP and NACC datasets. Conclusions: Results suggest that the proposed framework achieved sufficient imputation performance in dMRI data with incomplete FOV for improving whole-brain tractography, thereby repairing the corrupted data. Our approach achieved more accurate whole-brain tractography results with extended and complete FOV and reduced the uncertainty when analyzing bundles associated with Alzheimer&#39;s Disease. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03652v2-abstract-full').style.display = 'none'; document.getElementById('2405.03652v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 11 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Medical Imaging, Vol. 
11, Issue 4, 044008 (August 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19334">arXiv:2403.19334</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.19334">pdf</a>, <a href="https://arxiv.org/format/2403.19334">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Test-Time Domain Generalization for Face Anti-Spoofing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Q">Qianyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Ke-Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Taiping Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xuequan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+S">Shouhong Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Lizhuang Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19334v1-abstract-short" style="display: inline;"> Face Anti-Spoofing (FAS) is pivotal in safeguarding facial recognition systems against presentation attacks. While domain generalization (DG) methods have been developed to enhance FAS performance, they predominantly focus on learning domain-invariant features during training, which may not guarantee generalizability to unseen data that differs largely from the source distributions. Our insight is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19334v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19334v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19334v1-abstract-full" style="display: none;"> Face Anti-Spoofing (FAS) is pivotal in safeguarding facial recognition systems against presentation attacks. While domain generalization (DG) methods have been developed to enhance FAS performance, they predominantly focus on learning domain-invariant features during training, which may not guarantee generalizability to unseen data that differs largely from the source distributions. Our insight is that testing data can serve as a valuable resource to enhance the generalizability beyond mere evaluation for DG FAS. In this paper, we introduce a novel Test-Time Domain Generalization (TTDG) framework for FAS, which leverages the testing data to boost the model&#39;s generalizability. Our method, consisting of Test-Time Style Projection (TTSP) and Diverse Style Shifts Simulation (DSSS), effectively projects the unseen data to the seen domain space. In particular, we first introduce the innovative TTSP to project the styles of the arbitrarily unseen samples of the testing distribution to the known source space of the training distributions. We then design the efficient DSSS to synthesize diverse style shifts via learnable style bases with two specifically designed losses in a hyperspherical feature space. Our method eliminates the need for model updates at the test time and can be seamlessly integrated into not only the CNN but also ViT backbones. 
Comprehensive experiments on widely used cross-domain FAS benchmarks demonstrate our method&#39;s state-of-the-art performance and effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19334v1-abstract-full').style.display = 'none'; document.getElementById('2403.19334v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17870">arXiv:2403.17870</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17870">pdf</a>, <a href="https://arxiv.org/format/2403.17870">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Boosting Diffusion Models with Moving Average Sampling in Frequency Domain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Y">Yurui Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Q">Qi Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yehao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Qibin Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17870v1-abstract-short" style="display: inline;"> Diffusion models have recently brought a powerful revolution in image generation. Despite showing impressive generative capabilities, most of these models rely on the current sample to denoise the next one, possibly resulting in denoising instability. In this paper, we reinterpret the iterative denoising process as model optimization and leverage a moving average mechanism to ensemble all the prio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17870v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17870v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17870v1-abstract-full" style="display: none;"> Diffusion models have recently brought a powerful revolution in image generation. Despite showing impressive generative capabilities, most of these models rely on the current sample to denoise the next one, possibly resulting in denoising instability. In this paper, we reinterpret the iterative denoising process as model optimization and leverage a moving average mechanism to ensemble all the prior samples. 
Instead of simply applying moving average to the denoised samples at different timesteps, we first map the denoised samples to data space and then perform moving average to avoid distribution shift across timesteps. In view that diffusion models evolve the recovery from low-frequency components to high-frequency details, we further decompose the samples into different frequency components and execute moving average separately on each component. We name the complete approach &#34;Moving Average Sampling in Frequency domain (MASF)&#34;. MASF could be seamlessly integrated into mainstream pre-trained diffusion models and sampling schedules. Extensive experiments on both unconditional and conditional diffusion models demonstrate that our MASF leads to superior performances compared to the baselines, with almost negligible additional complexity cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17870v1-abstract-full').style.display = 'none'; document.getElementById('2403.17870v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17005">arXiv:2403.17005</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17005">pdf</a>, <a href="https://arxiv.org/format/2403.17005">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> TRIP: Temporal Residual Learning with Image Noise Prior for Image-to-Video Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhongwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+F">Fuchen Long</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Z">Zhaofan Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17005v1-abstract-short" style="display: inline;"> Recent advances in text-to-video generation have demonstrated the utility of powerful diffusion models. Nevertheless, the problem is not trivial when shaping diffusion models to animate static image (i.e., image-to-video generation). 
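The MASF entry above (arXiv:2403.17870) averages denoised estimates in data space and keeps separate moving averages per frequency band. A rough sketch of that idea using an FFT low-pass split and per-band exponential moving averages; the cutoff, decay rates, and class name are illustrative assumptions, not the paper's schedule.

```python
import torch

def split_frequencies(x, cutoff=0.25):
    """Split an image batch into low/high frequency parts with a centered FFT mask (illustrative)."""
    f = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
    h, w = x.shape[-2:]
    yy, xx = torch.meshgrid(torch.linspace(-1, 1, h, device=x.device),
                            torch.linspace(-1, 1, w, device=x.device), indexing="ij")
    mask = ((yy ** 2 + xx ** 2).sqrt() <= cutoff).to(x.dtype)
    low = torch.fft.ifft2(torch.fft.ifftshift(f * mask, dim=(-2, -1))).real
    return low, x - low

class FrequencyEMA:
    """Keeps separate moving averages of the denoised estimate's low- and high-frequency parts."""
    def __init__(self, beta_low=0.9, beta_high=0.5):   # decay rates are placeholder values
        self.beta = (beta_low, beta_high)
        self.state = None

    def update(self, x0_pred):
        low, high = split_frequencies(x0_pred)
        if self.state is None:
            self.state = [low, high]
        else:
            self.state = [b * s + (1 - b) * c
                          for b, s, c in zip(self.beta, self.state, (low, high))]
        return self.state[0] + self.state[1]   # recombined, averaged estimate in data space
```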
The difficulty originates from the aspect that the diffusion process of subsequent animated frames should not only preserve the faithful alignment with the given imag&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17005v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17005v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17005v1-abstract-full" style="display: none;"> Recent advances in text-to-video generation have demonstrated the utility of powerful diffusion models. Nevertheless, the problem is not trivial when shaping diffusion models to animate static image (i.e., image-to-video generation). The difficulty originates from the aspect that the diffusion process of subsequent animated frames should not only preserve the faithful alignment with the given image but also pursue temporal coherence among adjacent frames. To alleviate this, we present TRIP, a new recipe of image-to-video diffusion paradigm that pivots on image noise prior derived from static image to jointly trigger inter-frame relational reasoning and ease the coherent temporal modeling via temporal residual learning. Technically, the image noise prior is first attained through one-step backward diffusion process based on both static image and noised video latent codes. Next, TRIP executes a residual-like dual-path scheme for noise prediction: 1) a shortcut path that directly takes image noise prior as the reference noise of each frame to amplify the alignment between the first frame and subsequent frames; 2) a residual path that employs 3D-UNet over noised video and static image latent codes to enable inter-frame relational reasoning, thereby easing the learning of the residual noise for each frame. Furthermore, both reference and residual noise of each frame are dynamically merged via attention mechanism for final video generation. Extensive experiments on WebVid-10M, DTDB and MSR-VTT datasets demonstrate the effectiveness of our TRIP for image-to-video generation. Please see our project page at https://trip-i2v.github.io/TRIP/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17005v1-abstract-full').style.display = 'none'; document.getElementById('2403.17005v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024; Project page: https://trip-i2v.github.io/TRIP/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17004">arXiv:2403.17004</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17004">pdf</a>, <a href="https://arxiv.org/format/2403.17004">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> SD-DiT: Unleashing the Power of Self-supervised Discrimination in Diffusion Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+R">Rui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yehao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhenglong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C+W">Chang Wen Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17004v1-abstract-short" style="display: inline;"> Diffusion Transformer (DiT) has emerged as the new trend of generative diffusion models on image generation. In view of extremely slow convergence in typical DiT, recent breakthroughs have been driven by mask strategy that significantly improves the training efficiency of DiT with additional intra-image contextual learning. Despite this progress, mask strategy still suffers from two inherent limit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17004v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17004v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17004v1-abstract-full" style="display: none;"> Diffusion Transformer (DiT) has emerged as the new trend of generative diffusion models on image generation. In view of extremely slow convergence in typical DiT, recent breakthroughs have been driven by mask strategy that significantly improves the training efficiency of DiT with additional intra-image contextual learning. Despite this progress, mask strategy still suffers from two inherent limitations: (a) training-inference discrepancy and (b) fuzzy relations between mask reconstruction &amp; generative diffusion process, resulting in sub-optimal training of DiT. In this work, we address these limitations by novelly unleashing the self-supervised discrimination knowledge to boost DiT training. Technically, we frame our DiT in a teacher-student manner. The teacher-student discriminative pairs are built on the diffusion noises along the same Probability Flow Ordinary Differential Equation (PF-ODE). 
Instead of applying mask reconstruction loss over both DiT encoder and decoder, we decouple DiT encoder and decoder to separately tackle discriminative and generative objectives. In particular, by encoding discriminative pairs with student and teacher DiT encoders, a new discriminative loss is designed to encourage the inter-image alignment in the self-supervised embedding space. After that, student samples are fed into student DiT decoder to perform the typical generative diffusion task. Extensive experiments are conducted on ImageNet dataset, and our method achieves a competitive balance between training cost and generative capacity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17004v1-abstract-full').style.display = 'none'; document.getElementById('2403.17004v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17001">arXiv:2403.17001</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17001">pdf</a>, <a href="https://arxiv.org/format/2403.17001">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> VP3D: Unleashing 2D Visual Prompt for Text-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Haibo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17001v1-abstract-short" style="display: inline;"> Recent innovations on text-to-3D generation have featured Score Distillation Sampling (SDS), which enables the zero-shot learning of implicit 3D models (NeRF) by directly distilling prior knowledge from 2D diffusion models. 
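The SD-DiT entry above (arXiv:2403.17004) encourages inter-image alignment between student and teacher encoder embeddings built from PF-ODE noise pairs. One plausible, hedged reading of such a discriminative objective is an InfoNCE-style alignment loss; the paper's exact loss may differ, and the function below is only a sketch.

```python
import torch
import torch.nn.functional as F

def discriminative_alignment_loss(student_emb, teacher_emb, temperature=0.1):
    """Illustrative InfoNCE-style alignment between student/teacher embeddings of matched pairs.
    (student_i, teacher_i) pairs are treated as positives; all other teachers act as negatives."""
    s = F.normalize(student_emb, dim=-1)
    t = F.normalize(teacher_emb.detach(), dim=-1)   # the teacher is typically not updated by this loss
    logits = s @ t.t() / temperature                # [B, B] similarity matrix
    targets = torch.arange(s.size(0), device=s.device)
    return F.cross_entropy(logits, targets)
```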
However, current SDS-based models still struggle with intricate text prompts and commonly result in distorted 3D models with unrealistic textures or cross-view inconsistency is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17001v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17001v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17001v1-abstract-full" style="display: none;"> Recent innovations on text-to-3D generation have featured Score Distillation Sampling (SDS), which enables the zero-shot learning of implicit 3D models (NeRF) by directly distilling prior knowledge from 2D diffusion models. However, current SDS-based models still struggle with intricate text prompts and commonly result in distorted 3D models with unrealistic textures or cross-view inconsistency issues. In this work, we introduce a novel Visual Prompt-guided text-to-3D diffusion model (VP3D) that explicitly unleashes the visual appearance knowledge in 2D visual prompt to boost text-to-3D generation. Instead of solely supervising SDS with text prompt, VP3D first capitalizes on 2D diffusion model to generate a high-quality image from input text, which subsequently acts as visual prompt to strengthen SDS optimization with explicit visual appearance. Meanwhile, we couple the SDS optimization with additional differentiable reward function that encourages rendering images of 3D models to better visually align with 2D visual prompt and semantically match with text prompt. Through extensive experiments, we show that the 2D Visual Prompt in our VP3D significantly eases the learning of visual appearance of 3D models and thus leads to higher visual fidelity with more detailed textures. It is also appealing in view that when replacing the self-generating visual prompt with a given reference image, VP3D is able to trigger a new task of stylized text-to-3D generation. Our project page is available at https://vp3d-cvpr24.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17001v1-abstract-full').style.display = 'none'; document.getElementById('2403.17001v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024; Project page: https://vp3d-cvpr24.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17000">arXiv:2403.17000</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17000">pdf</a>, <a href="https://arxiv.org/format/2403.17000">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Learning Spatial Adaptation and Temporal Coherence in Diffusion Models for Video Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhikai Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+F">Fuchen Long</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Z">Zhaofan Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+W">Wengang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17000v1-abstract-short" style="display: inline;"> Diffusion models are just at a tipping point for image super-resolution task. Nevertheless, it is not trivial to capitalize on diffusion models for video super-resolution which necessitates not only the preservation of visual appearance from low-resolution to high-resolution videos, but also the temporal consistency across video frames. In this paper, we propose a novel approach, pursuing Spatial&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17000v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17000v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17000v1-abstract-full" style="display: none;"> Diffusion models are just at a tipping point for image super-resolution task. Nevertheless, it is not trivial to capitalize on diffusion models for video super-resolution which necessitates not only the preservation of visual appearance from low-resolution to high-resolution videos, but also the temporal consistency across video frames. In this paper, we propose a novel approach, pursuing Spatial Adaptation and Temporal Coherence (SATeCo), for video super-resolution. SATeCo pivots on learning spatial-temporal guidance from low-resolution videos to calibrate both latent-space high-resolution video denoising and pixel-space video reconstruction. Technically, SATeCo freezes all the parameters of the pre-trained UNet and VAE, and only optimizes two deliberately-designed spatial feature adaptation (SFA) and temporal feature alignment (TFA) modules, in the decoder of UNet and VAE. SFA modulates frame features via adaptively estimating affine parameters for each pixel, guaranteeing pixel-wise guidance for high-resolution frame synthesis. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17000">arXiv:2403.17000</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17000">pdf</a>, <a href="https://arxiv.org/format/2403.17000">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Learning Spatial Adaptation and Temporal Coherence in Diffusion Models for Video Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhikai Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+F">Fuchen Long</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Z">Zhaofan Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+W">Wengang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Diffusion models are just at a tipping point for the image super-resolution task. Nevertheless, it is not trivial to capitalize on diffusion models for video super-resolution, which necessitates not only the preservation of visual appearance from low-resolution to high-resolution videos, but also the temporal consistency across video frames. In this paper, we propose a novel approach, pursuing Spatial Adaptation and Temporal Coherence (SATeCo), for video super-resolution. SATeCo pivots on learning spatial-temporal guidance from low-resolution videos to calibrate both latent-space high-resolution video denoising and pixel-space video reconstruction. Technically, SATeCo freezes all the parameters of the pre-trained UNet and VAE, and only optimizes two deliberately-designed spatial feature adaptation (SFA) and temporal feature alignment (TFA) modules in the decoder of the UNet and VAE. SFA modulates frame features via adaptively estimating affine parameters for each pixel, guaranteeing pixel-wise guidance for high-resolution frame synthesis. TFA delves into feature interaction within a 3D local window (tubelet) through self-attention, and executes cross-attention between the tubelet and its low-resolution counterpart to guide temporal feature alignment. Extensive experiments conducted on the REDS4 and Vid4 datasets demonstrate the effectiveness of our approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p>
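<p class="is-size-7"><em>Editor's note:</em> a minimal PyTorch sketch of per-pixel affine feature modulation in the spirit of the SFA module described above. Layer choices, channel sizes, and how the low-resolution guidance is injected are assumptions for illustration, not the authors' implementation.</p>
<pre class="is-size-7">
# Hypothetical sketch of SFA-style pixel-wise modulation (not the authors' code).
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpatialFeatureAdaptation(nn.Module):
    def __init__(self, feat_ch=64, guide_ch=64, hidden=64):
        super().__init__()
        # small conv head that predicts a per-pixel scale and shift from LR guidance
        self.body = nn.Sequential(nn.Conv2d(guide_ch, hidden, 3, padding=1), nn.SiLU())
        self.to_scale = nn.Conv2d(hidden, feat_ch, 3, padding=1)
        self.to_shift = nn.Conv2d(hidden, feat_ch, 3, padding=1)

    def forward(self, feat, guide):
        # resize guidance to the feature resolution, then predict affine parameters
        guide = F.interpolate(guide, size=feat.shape[-2:], mode="bilinear", align_corners=False)
        h = self.body(guide)
        return feat * (1.0 + self.to_scale(h)) + self.to_shift(h)  # pixel-wise modulation

frames = torch.randn(2, 64, 32, 32)   # latent frame features in the decoder
lr_guide = torch.randn(2, 64, 8, 8)   # guidance features from the low-resolution video
print(SpatialFeatureAdaptation()(frames, lr_guide).shape)  # torch.Size([2, 64, 32, 32])
</pre>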
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11999">arXiv:2403.11999</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.11999">pdf</a>, <a href="https://arxiv.org/format/2403.11999">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> HIRI-ViT: Scaling Vision Transformer with High Resolution Inputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yehao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The hybrid deep models of Vision Transformer (ViT) and Convolutional Neural Network (CNN) have emerged as a powerful class of backbones for vision tasks. Scaling up the input resolution of such hybrid backbones naturally strengthens model capacity, but inevitably suffers from heavy computational cost that scales quadratically. Instead, we present a new hybrid backbone with HIgh-Resolution Inputs (namely HIRI-ViT) that upgrades the prevalent four-stage ViT to a five-stage ViT tailored for high-resolution inputs. HIRI-ViT is built upon the seminal idea of decomposing the typical CNN operations into two parallel CNN branches in a cost-efficient manner. One high-resolution branch directly takes primary high-resolution features as inputs, but uses fewer convolution operations. The other low-resolution branch first performs down-sampling and then utilizes more convolution operations over such low-resolution features. Experiments on both the recognition task (ImageNet-1K dataset) and dense prediction tasks (COCO and ADE20K datasets) demonstrate the superiority of HIRI-ViT. More remarkably, under comparable computational cost ($\sim$5.0 GFLOPs), HIRI-ViT achieves the best published Top-1 accuracy to date of 84.3% on ImageNet with 448$\times$448 inputs, an absolute improvement of 0.9% over the 83.4% of iFormer-S with 224$\times$224 inputs. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)</span> </p>
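<p class="is-size-7"><em>Editor's note:</em> a toy PyTorch sketch of the two-branch idea described above (a cheap high-resolution branch plus a heavier low-resolution branch that downsamples first). Channel counts, strides, and the additive fusion are assumptions for illustration, not the published HIRI-ViT design.</p>
<pre class="is-size-7">
# Hypothetical two-branch stem in the spirit of the abstract (not the authors' code).
import torch
import torch.nn as nn

class TwoBranchBlock(nn.Module):
    def __init__(self, in_ch=3, out_ch=64):
        super().__init__()
        # high-resolution branch: full-resolution input, few cheap convolutions
        self.hi = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=4, padding=1),
            nn.BatchNorm2d(out_ch), nn.GELU(),
        )
        # low-resolution branch: downsample first, then spend more convolutions on the small map
        self.lo = nn.Sequential(
            nn.AvgPool2d(kernel_size=2),
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_ch), nn.GELU(),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(out_ch), nn.GELU(),
        )

    def forward(self, x):
        hi = self.hi(x)   # B x C x H/4 x W/4
        lo = self.lo(x)   # B x C x H/4 x W/4
        return hi + lo    # simple additive fusion of the two branches

x = torch.randn(1, 3, 448, 448)
print(TwoBranchBlock()(x).shape)  # torch.Size([1, 64, 112, 112])
</pre>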
inline;"> Understanding the anatomy of renal pathology is crucial for advancing disease diagnostics, treatment evaluation, and clinical research. The complex kidney system comprises various components across multiple levels, including regions (cortex, medulla), functional units (glomeruli, tubules), and cells (podocytes, mesangial cells in glomerulus). Prior studies have predominantly overlooked the intrica&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.19286v2-abstract-full').style.display = 'inline'; document.getElementById('2402.19286v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.19286v2-abstract-full" style="display: none;"> Understanding the anatomy of renal pathology is crucial for advancing disease diagnostics, treatment evaluation, and clinical research. The complex kidney system comprises various components across multiple levels, including regions (cortex, medulla), functional units (glomeruli, tubules), and cells (podocytes, mesangial cells in glomerulus). Prior studies have predominantly overlooked the intricate spatial interrelations among objects from clinical knowledge. In this research, we introduce a novel universal proposition learning approach, called panoramic renal pathology segmentation (PrPSeg), designed to segment comprehensively panoramic structures within kidney by integrating extensive knowledge of kidney anatomy. In this paper, we propose (1) the design of a comprehensive universal proposition matrix for renal pathology, facilitating the incorporation of classification and spatial relationships into the segmentation process; (2) a token-based dynamic head single network architecture, with the improvement of the partial label image segmentation and capability for future data enlargement; and (3) an anatomy loss function, quantifying the inter-object relationships across the kidney. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.19286v2-abstract-full').style.display = 'none'; document.getElementById('2402.19286v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE / CVF Computer Vision and Pattern Recognition Conference 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.18236">arXiv:2402.18236</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.18236">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Image2Flow: A hybrid image and graph convolutional neural network for rapid patient-specific pulmonary artery segmentation and CFD flow field calculation from 3D cardiac MRI data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tina Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Pajaziti%2C+E">Endrit Pajaziti</a>, <a href="/search/cs?searchtype=author&amp;query=Quail%2C+M">Michael Quail</a>, <a href="/search/cs?searchtype=author&amp;query=Schievano%2C+S">Silvia Schievano</a>, <a href="/search/cs?searchtype=author&amp;query=Steeden%2C+J+A">Jennifer A Steeden</a>, <a href="/search/cs?searchtype=author&amp;query=Muthurangu%2C+V">Vivek Muthurangu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.18236v1-abstract-short" style="display: inline;"> Computational fluid dynamics (CFD) can be used for evaluation of hemodynamics. However, its routine use is limited by labor-intensive manual segmentation, CFD mesh creation, and time-consuming simulation. This study aims to train a deep learning model to both generate patient-specific volume-meshes of the pulmonary artery from 3D cardiac MRI data and directly estimate CFD flow fields. This study&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18236v1-abstract-full').style.display = 'inline'; document.getElementById('2402.18236v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.18236v1-abstract-full" style="display: none;"> Computational fluid dynamics (CFD) can be used for evaluation of hemodynamics. However, its routine use is limited by labor-intensive manual segmentation, CFD mesh creation, and time-consuming simulation. This study aims to train a deep learning model to both generate patient-specific volume-meshes of the pulmonary artery from 3D cardiac MRI data and directly estimate CFD flow fields. This study used 135 3D cardiac MRIs from both a public and private dataset. The pulmonary arteries in the MRIs were manually segmented and converted into volume-meshes. CFD simulations were performed on ground truth meshes and interpolated onto point-point correspondent meshes to create the ground truth dataset. The dataset was split 85/10/15 for training, validation and testing. Image2Flow, a hybrid image and graph convolutional neural network, was trained to transform a pulmonary artery template to patient-specific anatomy and CFD values. Image2Flow was evaluated in terms of segmentation and accuracy of CFD predicted was assessed using node-wise comparisons. 
Centerline comparisons of Image2Flow and CFD simulations performed using machine learning segmentation were also performed. Image2Flow achieved excellent segmentation accuracy with a median Dice score of 0.9 (IQR: 0.86-0.92). The median node-wise normalized absolute error for pressure and velocity magnitude was 11.98% (IQR: 9.44-17.90%) and 8.06% (IQR: 7.54-10.41), respectively. Centerline analysis showed no significant difference between the Image2Flow and conventional CFD simulated on machine learning-generated volume-meshes. This proof-of-concept study has shown it is possible to simultaneously perform patient specific volume-mesh based segmentation and pressure and flow field estimation. Image2Flow completes segmentation and CFD in ~205ms, which ~7000 times faster than manual methods, making it more feasible in a clinical environment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18236v1-abstract-full').style.display = 'none'; document.getElementById('2402.18236v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 7 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09807">arXiv:2402.09807</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.09807">pdf</a>, <a href="https://arxiv.org/format/2402.09807">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Two trust region type algorithms for solving nonconvex-strongly concave minimax problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tongliang Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09807v1-abstract-short" style="display: inline;"> In this paper, we propose a Minimax Trust Region (MINIMAX-TR) algorithm and a Minimax Trust Region Algorithm with Contractions and Expansions(MINIMAX-TRACE) algorithm for solving nonconvex-strongly concave minimax problems. Both algorithms can find an $(蔚, \sqrt蔚)$-second order stationary point(SSP) within $\mathcal{O}(蔚^{-1.5})$ iterations, which matches the best well known iteration complexity. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09807v1-abstract-full" style="display: none;"> In this paper, we propose a Minimax Trust Region (MINIMAX-TR) algorithm and a Minimax Trust Region Algorithm with Contractions and Expansions(MINIMAX-TRACE) algorithm for solving nonconvex-strongly concave minimax problems. 
Both algorithms can find an $(蔚, \sqrt蔚)$-second order stationary point(SSP) within $\mathcal{O}(蔚^{-1.5})$ iterations, which matches the best well known iteration complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09807v1-abstract-full').style.display = 'none'; document.getElementById('2402.09807v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 90C47; 90C26; 90C30 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.01256">arXiv:2401.01256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.01256">pdf</a>, <a href="https://arxiv.org/format/2401.01256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VideoStudio: Generating Consistent-Content and Multi-Scene Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Long%2C+F">Fuchen Long</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Z">Zhaofan Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.01256v2-abstract-short" style="display: inline;"> The recent innovations and breakthroughs in diffusion models have significantly expanded the possibilities of generating high-quality videos for the given prompts. Most existing works tackle the single-scene scenario with only one video event occurring in a single background. Extending to generate multi-scene videos nevertheless is not trivial and necessitates to nicely manage the logic in between&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01256v2-abstract-full').style.display = 'inline'; document.getElementById('2401.01256v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.01256v2-abstract-full" style="display: none;"> The recent innovations and breakthroughs in diffusion models have significantly expanded the possibilities of generating high-quality videos for the given prompts. Most existing works tackle the single-scene scenario with only one video event occurring in a single background. Extending to generate multi-scene videos nevertheless is not trivial and necessitates to nicely manage the logic in between while preserving the consistent visual appearance of key content across video scenes. In this paper, we propose a novel framework, namely VideoStudio, for consistent-content and multi-scene video generation. 
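<p class="is-size-7 mathjax"><em>Editor's note:</em> the abstract does not spell out the stationarity measure. In this literature, an $(\epsilon, \sqrt{\epsilon})$-second-order stationary point of the primal function $\Phi(x) := \max_y f(x, y)$ is commonly defined (up to constants) by
$$\|\nabla \Phi(\bar{x})\| \le \epsilon \quad \text{and} \quad \lambda_{\min}\big(\nabla^2 \Phi(\bar{x})\big) \ge -\sqrt{\epsilon},$$
and $\mathcal{O}(\epsilon^{-1.5})$ matches the best known rate of second-order (trust-region or cubic-regularization type) methods for reaching such points in smooth nonconvex minimization. This is background context, not a definition taken from the paper itself.</p>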
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.01256">arXiv:2401.01256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.01256">pdf</a>, <a href="https://arxiv.org/format/2401.01256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VideoStudio: Generating Consistent-Content and Multi-Scene Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Long%2C+F">Fuchen Long</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Z">Zhaofan Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The recent innovations and breakthroughs in diffusion models have significantly expanded the possibilities of generating high-quality videos for given prompts. Most existing works tackle the single-scene scenario with only one video event occurring in a single background. Extending to multi-scene video generation is nevertheless not trivial and necessitates nicely managing the logic between scenes while preserving the consistent visual appearance of key content across video scenes. In this paper, we propose a novel framework, namely VideoStudio, for consistent-content and multi-scene video generation. Technically, VideoStudio leverages Large Language Models (LLMs) to convert the input prompt into a comprehensive multi-scene script that benefits from the logical knowledge learnt by the LLM. The script for each scene includes a prompt describing the event, the foreground/background entities, as well as the camera movement. VideoStudio identifies the common entities throughout the script and asks the LLM to detail each entity. The resultant entity description is then fed into a text-to-image model to generate a reference image for each entity. Finally, VideoStudio outputs a multi-scene video by generating each scene video via a diffusion process that takes the reference images, the descriptive prompt of the event and the camera movement into account. The diffusion model incorporates the reference images as conditions and alignment signals to strengthen the content consistency of multi-scene videos. Extensive experiments demonstrate that VideoStudio outperforms the SOTA video generation models in terms of visual quality, content consistency, and user preference. Source code is available at \url{https://github.com/FuchenUSTC/VideoStudio}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024. Source code is available at https://github.com/FuchenUSTC/VideoStudio</span> </p>
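<p class="is-size-7"><em>Editor's note:</em> one possible shape for the multi-scene "script" the abstract describes (per-scene event prompt, foreground/background entities, camera movement, plus LLM-detailed entity descriptions and per-entity reference images). The field names below are assumptions for illustration, not VideoStudio's actual format.</p>
<pre class="is-size-7">
# Hypothetical data structure mirroring the script described in the abstract.
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class SceneSpec:
    event_prompt: str      # what happens in this scene
    foreground: List[str]  # recurring entities to keep visually consistent
    background: str
    camera: str            # e.g. "slow tracking shot"

@dataclass
class MultiSceneScript:
    scenes: List[SceneSpec]
    entity_descriptions: Dict[str, str] = field(default_factory=dict)  # detailed by the LLM
    reference_images: Dict[str, str] = field(default_factory=dict)     # entity name -> image path

script = MultiSceneScript(scenes=[
    SceneSpec("a corgi chases a red ball in a park", ["corgi", "red ball"], "sunny park", "tracking shot"),
    SceneSpec("the corgi naps under a tree", ["corgi"], "the same park at dusk", "static wide shot"),
])
print(len(script.scenes))
</pre>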
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.05464">arXiv:2311.05464</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.05464">pdf</a>, <a href="https://arxiv.org/format/2311.05464">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> 3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Haibo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhineng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> 3D content creation via text-driven stylization has posed a fundamental challenge to the multimedia and graphics community. Recent advances in cross-modal foundation models (e.g., CLIP) have made this problem feasible. Those approaches commonly leverage CLIP to align the holistic semantics of the stylized mesh with the given text prompt. Nevertheless, it is not trivial to enable more controllable stylization of fine-grained details in 3D meshes solely based on such semantic-level cross-modal supervision. In this work, we propose a new 3DStyle-Diffusion model that triggers fine-grained stylization of 3D meshes with additional controllable appearance and geometric guidance from 2D diffusion models. Technically, 3DStyle-Diffusion first parameterizes the texture of the 3D mesh into reflectance properties and scene lighting using implicit MLP networks. Meanwhile, an accurate depth map of each sampled view is obtained conditioned on the 3D mesh. Then, 3DStyle-Diffusion leverages a pre-trained controllable 2D diffusion model to guide the learning of rendered images, encouraging the synthesized image of each view to be semantically aligned with the text prompt and geometrically consistent with the depth map. This design elegantly integrates both image rendering via implicit MLP networks and the diffusion process of image synthesis in an end-to-end fashion, enabling high-quality fine-grained stylization of 3D meshes. We also build a new dataset derived from Objaverse and an evaluation protocol for this task. Through both qualitative and quantitative experiments, we validate the capability of our 3DStyle-Diffusion. Source code and data are available at \url{https://github.com/yanghb22-fdu/3DStyle-Diffusion-Official}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM Multimedia 2023</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.05463">arXiv:2311.05463</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.05463">pdf</a>, <a href="https://arxiv.org/format/2311.05463">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jingwen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recently, the multimedia community has witnessed the rise of diffusion models trained on large-scale multi-modal data for visual content creation, particularly in the field of text-to-image generation. In this paper, we propose a new task for ``stylizing'' text-to-image models, namely text-driven stylized image generation, that further enhances editability in content creation. Given an input text prompt and a style image, this task aims to produce stylized images which are both semantically relevant to the input text prompt and aligned with the style image in style. To achieve this, we present a new diffusion model (ControlStyle) by upgrading a pre-trained text-to-image model with a trainable modulation network that enables conditioning on both text prompts and style images. Moreover, diffusion style and content regularizations are simultaneously introduced to facilitate the learning of this modulation network with these diffusion priors, pursuing high-quality stylized text-to-image generation. Extensive experiments demonstrate the effectiveness of our ControlStyle in producing more visually pleasing and artistic results, surpassing a simple combination of a text-to-image model and conventional style transfer techniques. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM Multimedia 2023</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.05461">arXiv:2311.05461</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.05461">pdf</a>, <a href="https://arxiv.org/format/2311.05461">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Control3D: Towards Controllable Text-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yingwei Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yehao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent remarkable advances in large-scale text-to-image diffusion models have inspired a significant breakthrough in text-to-3D generation, pursuing 3D content creation solely from a given text prompt. However, existing text-to-3D techniques lack a crucial ability in the creative process: the ability to interactively control and shape the synthetic 3D content according to users' desired specifications (e.g., a sketch). To alleviate this issue, we present the first attempt at text-to-3D generation conditioned on an additional hand-drawn sketch, namely Control3D, which enhances controllability for users. In particular, a 2D conditioned diffusion model (ControlNet) is remoulded to guide the learning of the 3D scene parameterized as a NeRF, encouraging each view of the 3D scene to be aligned with the given text prompt and hand-drawn sketch. Moreover, we exploit a pre-trained differentiable photo-to-sketch model to directly estimate the sketch of the rendered image over the synthetic 3D scene. The sketch estimated for each sampled view is further enforced to be geometrically consistent with the given sketch, pursuing better controllable text-to-3D generation. Through extensive experiments, we demonstrate that our proposal can generate accurate and faithful 3D scenes that align closely with the input text prompts and sketches. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM Multimedia 2023</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.02495">arXiv:2311.02495</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.02495">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1038/s41598-024-61189-x">10.1038/s41598-024-61189-x</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Uncertainty Quantification in Multivariable Regression for Material Property Prediction with Bayesian Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Longze Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+J">Jiang Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Vakanski%2C+A">Aleksandar Vakanski</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yachun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tiankai Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Xian%2C+M">Min Xian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> With the increased use of data-driven approaches and machine learning-based methods in material science, the importance of reliable uncertainty quantification (UQ) of the predicted variables for informed decision-making cannot be overstated. UQ in material property prediction poses unique challenges, including the multi-scale and multi-physics nature of advanced materials, intricate interactions between numerous factors, limited availability of large curated datasets for model training, etc. Recently, Bayesian Neural Networks (BNNs) have emerged as a promising approach for UQ, offering a probabilistic framework for capturing uncertainties within neural networks. In this work, we introduce an approach for UQ within physics-informed BNNs, which integrates knowledge from governing laws in material modeling to guide the models toward physically consistent predictions. To evaluate the effectiveness of this approach, we present case studies for predicting the creep rupture life of steel alloys. Experimental validation with three datasets of collected measurements from creep tests demonstrates the ability of BNNs to produce accurate point and uncertainty estimates that are competitive with or exceed the performance of the conventional method of Gaussian Process Regression. Similarly, we evaluated the suitability of BNNs for UQ in an active learning application and reported competitive performance. The most promising framework for creep life prediction is BNNs based on Markov Chain Monte Carlo approximation of the posterior distribution of network parameters, as it provided more reliable results in comparison to BNNs based on variational inference approximation or related NNs with probabilistic outputs. The code is available at: https://github.com/avakanski/Creep-uncertainty-quantification. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Scientific Reports, 14(1):10543, 2024 </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05185">arXiv:2310.05185</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.05185">pdf</a>, <a href="https://arxiv.org/format/2310.05185">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Text2NKG: Fine-Grained N-ary Relation Extraction for N-ary relational Knowledge Graph Construction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+H">Haoran Luo</a>, <a href="/search/cs?searchtype=author&amp;query=E%2C+H">Haihong E</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuhao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Tianyu Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yikai Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Z">Zichen Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wentai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+K">Kaiyang Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+S">Shiyao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M">Meina Song</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yifan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Tuan%2C+L+A">Luu Anh Tuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Beyond traditional binary relational facts, n-ary relational knowledge graphs (NKGs) consist of n-ary relational facts containing more than two entities, which are closer to real-world facts with broader applications. However, the construction of NKGs remains at a coarse-grained level, typically restricted to a single schema and ignoring the order and variable arity of entities. To address these restrictions, we propose Text2NKG, a novel fine-grained n-ary relation extraction framework for n-ary relational knowledge graph construction. We introduce a span-tuple classification approach with hetero-ordered merging and output merging to accomplish fine-grained n-ary relation extraction of different arities. Furthermore, Text2NKG supports four typical NKG schemas: hyper-relational schema, event-based schema, role-based schema, and hypergraph-based schema, with high flexibility and practicality. The experimental results demonstrate that Text2NKG achieves state-of-the-art performance in F1 scores on the fine-grained n-ary relation extraction benchmark. Our code and datasets are publicly available. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024 main conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NeurIPS 2024 </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05125">arXiv:2310.05125</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.05125">pdf</a>, <a href="https://arxiv.org/format/2310.05125">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1109/TMM.2023.3321535">10.1109/TMM.2023.3321535</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Bidirectional Knowledge Reconfiguration for Lightweight Point Cloud Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peipei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+X">Xing Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yibo Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Man Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Ting Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+T">Tao Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Point cloud analysis faces substantial computational overhead, limiting its application on mobile or edge devices. Directly employing small models may result in a significant drop in performance, since it is difficult for a small model to adequately capture local structure and global shape information simultaneously, which are essential clues for point cloud analysis. This paper explores feature distillation for lightweight point cloud models. To mitigate the semantic gap between the lightweight student and the cumbersome teacher, we propose bidirectional knowledge reconfiguration (BKR) to distill informative contextual knowledge from the teacher to the student. Specifically, a top-down knowledge reconfiguration and a bottom-up knowledge reconfiguration are developed to inherit diverse local structure information and consistent global shape knowledge from the teacher, respectively. However, due to the farthest point sampling in most point cloud models, the intermediate features between teacher and student are misaligned, deteriorating the feature distillation performance. To eliminate this misalignment, we propose a feature mover's distance (FMD) loss based on optimal transportation, which can measure the distance between unordered point cloud features effectively. Extensive experiments conducted on shape classification, part segmentation, and semantic segmentation benchmarks demonstrate the universality and superiority of our method. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Multimedia (TMM)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Multimedia (Early Access), 02 October 2023 </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.11132">arXiv:2309.11132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.11132">pdf</a>, <a href="https://arxiv.org/format/2309.11132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Contrastive Pseudo Learning for Open-World DeepFake Attribution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhimin Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+T">Taiping Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+B">Bangjie Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+R">Ran Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+S">Shouhong Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Lizhuang Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The challenge of source attribution for forged faces has gained widespread attention due to the rapid development of generative techniques. While many recent works have taken essential steps on GAN-generated faces, more threatening attacks related to identity swapping or expression transferring are still overlooked. Moreover, the forgery traces hidden in unknown attacks from open-world unlabeled faces remain under-explored. To push the related frontier research, we introduce a new benchmark called Open-World DeepFake Attribution (OW-DFA), which aims to evaluate attribution performance against various types of fake faces under open-world scenarios. Meanwhile, we propose a novel framework named Contrastive Pseudo Learning (CPL) for the OW-DFA task through 1) introducing a Global-Local Voting module to guide the feature alignment of forged faces with different manipulated regions, and 2) designing a Confidence-based Soft Pseudo-label strategy to mitigate the pseudo-noise caused by similar methods in the unlabeled set. In addition, we extend the CPL framework with a multi-stage paradigm that leverages pre-training and iterative learning to further enhance traceability performance. Extensive experiments verify the superiority of our proposed method on OW-DFA and also demonstrate the interpretability of the deepfake attribution task and its impact on improving the security of the deepfake detection area. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 7 figures, ICCV 2023</span> </p>
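<p class="is-size-7"><em>Editor's note:</em> a minimal PyTorch sketch of confidence-weighted soft pseudo-labels for unlabeled samples, in the spirit of the Confidence-based Soft Pseudo-label strategy described above. The temperature, threshold, and weighting scheme are assumptions for illustration, not the authors' recipe (requires PyTorch 1.10+ for probabilistic targets in cross_entropy).</p>
<pre class="is-size-7">
# Hypothetical soft pseudo-labeling sketch (not the authors' code).
import torch
import torch.nn.functional as F

def soft_pseudo_labels(logits, temperature=0.5, threshold=0.8):
    """logits: (B, K) classifier outputs on unlabeled faces."""
    probs = F.softmax(logits / temperature, dim=1)   # sharpened soft targets
    conf, _ = probs.max(dim=1)                       # per-sample confidence
    weight = conf.ge(threshold).float() * conf       # drop / down-weight low-confidence samples
    return probs, weight

def pseudo_label_loss(student_logits, soft_targets, weight):
    per_sample = F.cross_entropy(student_logits, soft_targets, reduction="none")
    return (weight * per_sample).sum() / weight.sum().clamp(min=1e-6)

teacher_logits = torch.randn(8, 5)                   # e.g. 5 attribution classes
student_logits = torch.randn(8, 5)
targets, w = soft_pseudo_labels(teacher_logits)
print(pseudo_label_loss(student_logits, targets, w))
</pre>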
