Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 676 results for author: <span class="mathjax">Cao, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Cao%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Cao, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Cao%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Cao, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Cao%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Cao%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Cao%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Cao%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Cao%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Cao%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18288">arXiv:2411.18288</a> <span> [<a href="https://arxiv.org/pdf/2411.18288">pdf</a>, <a href="https://arxiv.org/format/2411.18288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Multispectral Object Detection: A Bag of Tricks and Comprehensive Benchmarks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Chen Zhou</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+P">Peng Cheng</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junfeng Fang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yibo Yan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yanyan Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kun Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18288v1-abstract-short" style="display: inline;"> Multispectral object detection, utilizing RGB and TIR (thermal infrared) modalities, is widely recognized as a challenging task. 
It requires not only the effective extraction of features from both modalities and robust fusion strategies, but also the ability to address issues such as spectral discrepancies, spatial misalignment, and environmental dependencies between RGB and TIR images. These chal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18288v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18288v1-abstract-full" style="display: none;"> Multispectral object detection, utilizing RGB and TIR (thermal infrared) modalities, is widely recognized as a challenging task. It requires not only the effective extraction of features from both modalities and robust fusion strategies, but also the ability to address issues such as spectral discrepancies, spatial misalignment, and environmental dependencies between RGB and TIR images. These challenges significantly hinder the generalization of multispectral detection systems across diverse scenarios. Although numerous studies have attempted to overcome these limitations, it remains difficult to clearly distinguish the performance gains of multispectral detection systems from the impact of these "optimization techniques". Worse still, despite the rapid emergence of high-performing single-modality detection models, there is still a lack of specialized training techniques that can effectively adapt these models for multispectral detection tasks. The absence of a standardized benchmark with fair and consistent experimental setups also poses a significant barrier to evaluating the effectiveness of new approaches. To this end, we propose the first fair and reproducible benchmark specifically designed to evaluate the training "techniques", which systematically classifies existing multispectral object detection methods, investigates their sensitivity to hyper-parameters, and standardizes the core configurations. A comprehensive evaluation is conducted across multiple representative multispectral object detection datasets, utilizing various backbone networks and detection frameworks. Additionally, we introduce an efficient and easily deployable multispectral object detection framework that can seamlessly optimize high-performing single-modality models into dual-modality models, integrating our advanced training techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18288v1-abstract-full').style.display = 'none'; document.getElementById('2411.18288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
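The closing sentence of this abstract (turning a single-modality detector into a dual-modality one) can be pictured with a minimal two-stream sketch: duplicate the pretrained backbone for RGB and TIR and fuse mid-level features before the existing detection head. This is an illustrative PyTorch sketch, not the paper's actual framework; `make_backbone`, `head`, and the 1×1-conv fusion are stand-ins.

```python
import torch
import torch.nn as nn

class DualModalityDetector(nn.Module):
    """Illustrative sketch: reuse a single-modality backbone for both
    RGB and TIR streams, then fuse features before the detection head."""
    def __init__(self, make_backbone, head, feat_dim=256):
        super().__init__()
        self.rgb_backbone = make_backbone()   # e.g. a pretrained ResNet trunk
        self.tir_backbone = make_backbone()   # weights copied, then fine-tuned
        self.fuse = nn.Conv2d(2 * feat_dim, feat_dim, kernel_size=1)
        self.head = head                      # any existing detection head

    def forward(self, rgb, tir):
        f_rgb = self.rgb_backbone(rgb)        # (B, C, H, W)
        f_tir = self.tir_backbone(tir)        # (B, C, H, W), spatially aligned
        fused = self.fuse(torch.cat([f_rgb, f_tir], dim=1))
        return self.head(fused)
```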
2. arXiv:2411.17711 [pdf, other] eess.SP, cs.AI, cs.LG
   AnyECG: Foundational Models for Electrocardiogram Analysis
   Authors: Yue Wang, Xu Cao, Yaojun Hu, Haochao Ying, James Matthew Rehg, Jimeng Sun, Jian Wu, Jintai Chen
   Abstract: Electrocardiogram (ECG), a non-invasive and affordable tool for cardiac monitoring, is highly sensitive in detecting acute heart attacks. However, due to the lengthy nature of ECG recordings, numerous machine learning methods have been developed for automated heart disease detection to reduce human workload. Despite these efforts, performance remains suboptimal. A key obstacle is the inherent complexity of ECG data, which includes heterogeneity (e.g., varying sampling rates), high levels of noise, demographic-related pattern shifts, and intricate rhythm-event associations. To overcome these challenges, this paper introduces AnyECG, a foundational model designed to extract robust representations from any real-world ECG data. Specifically, a tailored ECG Tokenizer encodes each fixed-duration ECG fragment into a token and, guided by proxy tasks, converts noisy, continuous ECG features into discrete, compact, and clinically meaningful local rhythm codes. These codes encapsulate basic morphological, frequency, and demographic information (e.g., sex), effectively mitigating signal noise. We further pre-train AnyECG to learn rhythmic pattern associations across ECG tokens, enabling the capture of cardiac event semantics. By being jointly pre-trained on diverse ECG data sources, AnyECG is capable of generalizing across a wide range of downstream tasks where ECG signals are recorded from various devices and scenarios. Experimental results in anomaly detection, arrhythmia detection, corrupted lead generation, and ultra-long ECG signal analysis demonstrate that AnyECG learns common ECG knowledge from data and significantly outperforms cutting-edge methods in each respective task.
   Submitted 17 November, 2024; originally announced November 2024.
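The ECG Tokenizer is described only functionally here (fixed-duration fragment in, discrete rhythm code out). One common way to realize such discretization is vector quantization: encode the fragment, then snap its embedding to the nearest codebook entry. The sketch below assumes that VQ-style design purely for illustration; the fragment length, encoder, and codebook size are invented.

```python
import torch
import torch.nn as nn

class ECGTokenizer(nn.Module):
    """Toy VQ-style tokenizer: one fixed-duration fragment -> one code id."""
    def __init__(self, frag_len=250, embed_dim=64, codebook_size=512):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=7, stride=2, padding=3), nn.ReLU(),
            nn.Conv1d(32, embed_dim, kernel_size=7, stride=2, padding=3),
            nn.AdaptiveAvgPool1d(1),          # pool over time -> one vector
        )
        self.codebook = nn.Embedding(codebook_size, embed_dim)

    def forward(self, frag):                  # frag: (B, 1, frag_len)
        z = self.encoder(frag).squeeze(-1)    # (B, embed_dim)
        # nearest codebook entry serves as the discrete "rhythm code"
        d = torch.cdist(z, self.codebook.weight)   # (B, codebook_size)
        return d.argmin(dim=1)                # (B,) token ids
```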
3. arXiv:2411.16733 [pdf, other] cs.CV
   Towards Satellite Image Road Graph Extraction: A Global-Scale Dataset and A Novel Method
   Authors: Pan Yin, Kaiyu Li, Xiangyong Cao, Jing Yao, Lei Liu, Xueru Bai, Feng Zhou, Deyu Meng
   Abstract: Recently, road graph extraction has garnered increasing attention due to its crucial role in autonomous driving, navigation, etc. However, accurately and efficiently extracting road graphs remains a persistent challenge, primarily due to the severe scarcity of labeled data. To address this limitation, we collect a global-scale satellite road graph extraction dataset, i.e. the Global-Scale dataset. Specifically, the Global-Scale dataset is ~20× larger than the largest existing public road extraction dataset and spans over 13,800 km² globally. Additionally, we develop a novel road graph extraction model, i.e. SAM-Road++, which adopts a node-guided resampling method to alleviate the mismatch issue between training and inference in SAM-Road, a pioneering state-of-the-art road graph extraction model. Furthermore, we propose a simple yet effective "extended-line" strategy in SAM-Road++ to mitigate the occlusion issue on the road. Extensive experiments demonstrate the validity of the collected Global-Scale dataset and the proposed SAM-Road++ method, particularly highlighting its superior predictive power in unseen regions. The dataset and code are available at https://github.com/earth-insights/samroadplus.
   Submitted 23 November, 2024; originally announced November 2024.
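The "extended-line" strategy is named but not specified here; one plausible reading is that a predicted road polyline is extended along its own direction so that short occlusions (trees, overpasses) can be bridged by snapping the extension to a nearby node. A toy geometric sketch under that assumption, with all names hypothetical:

```python
import numpy as np

def extend_line(polyline, reach=20.0):
    """Extend the last segment of a road polyline along its direction.

    polyline: (N, 2) array of points. Returns a probe point 'reach'
    units beyond the endpoint, which can then be snapped to a nearby
    road node to bridge an occlusion. Purely illustrative.
    """
    p_prev, p_end = polyline[-2], polyline[-1]
    direction = p_end - p_prev
    direction = direction / (np.linalg.norm(direction) + 1e-8)
    return p_end + reach * direction

def snap_to_node(probe, nodes, radius=10.0):
    """Return the nearest known graph node within 'radius', else None."""
    d = np.linalg.norm(nodes - probe, axis=1)
    i = d.argmin()
    return nodes[i] if d[i] <= radius else None
```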
4. arXiv:2411.16207 [pdf, other] cs.CR
   Can Encrypted Images Still Train Neural Networks? Investigating Image Information and Random Vortex Transformation
   Authors: XiaoKai Cao, WenJin Mo, ChangDong Wang, JianHuang Lai, Qiong Huang
   Abstract: Vision is one of the essential sources through which humans acquire information. In this paper, we establish a novel framework for measuring image information content to evaluate the variation in information content during image transformations. Within this framework, we design a nonlinear function to calculate the neighboring information content of pixels at different distances, and then use this information to measure the overall information content of the image. Hence, we define a function to represent the variation in information content during image transformations. Additionally, we utilize this framework to prove the conclusion that swapping the positions of any two pixels reduces the image's information content. Furthermore, based on the aforementioned framework, we propose a novel image encryption algorithm called Random Vortex Transformation. This algorithm encrypts the image using random functions while preserving the neighboring information of the pixels. The encrypted images are difficult for the human eye to distinguish, yet they allow for direct training on the encrypted images using machine learning methods. Experimental verification demonstrates that training on the encrypted dataset using ResNet and Vision Transformers only results in a decrease in accuracy ranging from 0.3% to 6.5% compared to the original data, while ensuring the security of the data. Furthermore, there is a positive correlation between the rate of information loss in the images and the rate of accuracy loss, further supporting the validity of the proposed image information content measurement framework.
   Submitted 25 November, 2024; originally announced November 2024.
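The abstract gives only the shape of the information measure (a nonlinear function of neighboring pixel content, discounted with distance). A toy version under that reading is sketched below; the exp(-|difference|) nonlinearity is an arbitrary stand-in, and the final assertion illustrates the paper's claim that scrambling pixel positions lowers the score.

```python
import numpy as np

def neighbor_information(img, max_dist=3):
    """Toy neighborhood-information score: nearby pixels that resemble
    each other contribute more, farther neighbors are discounted.
    The exp(-|difference|) nonlinearity is an arbitrary stand-in."""
    img = img.astype(np.float64)
    total = 0.0
    for d in range(1, max_dist + 1):
        diff_x = np.abs(img[:, d:] - img[:, :-d])
        diff_y = np.abs(img[d:, :] - img[:-d, :])
        total += (np.exp(-diff_x / 255).mean() + np.exp(-diff_y / 255).mean()) / d
    return total

rng = np.random.default_rng(0)
img = np.tile(np.arange(64, dtype=np.uint8) * 4, (64, 1))  # smooth gradient
shuffled = rng.permutation(img.ravel()).reshape(img.shape)
# scrambling pixel positions destroys neighborhood structure
assert neighbor_information(img) > neighbor_information(shuffled)
```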
5. arXiv:2411.16198 [pdf, other] cs.CV
   Interpreting Object-level Foundation Models via Visual Precision Search
   Authors: Ruoyu Chen, Siyuan Liang, Jingzhi Li, Shiming Liu, Maosen Li, Zheng Huang, Hua Zhang, Xiaochun Cao
   Abstract: Advances in multimodal pre-training have propelled object-level foundation models, such as Grounding DINO and Florence-2, in tasks like visual grounding and object detection. However, interpreting these models' decisions has grown increasingly challenging. Existing interpretable attribution methods for object-level task interpretation have notable limitations: (1) gradient-based methods lack precise localization due to visual-textual fusion in foundation models, and (2) perturbation-based methods produce noisy saliency maps, limiting fine-grained interpretability. To address these, we propose a Visual Precision Search method that generates accurate attribution maps with fewer regions. Our method bypasses internal model parameters to overcome attribution issues from multimodal fusion, dividing inputs into sparse sub-regions and using consistency and collaboration scores to accurately identify critical decision-making regions. We also conducted a theoretical analysis of the boundary guarantees and scope of applicability of our method. Experiments on RefCOCO, MS COCO, and LVIS show our approach enhances object-level task interpretability over SOTA for Grounding DINO and Florence-2 across various evaluation metrics, with faithfulness gains of 23.7%, 31.6%, and 20.1% on MS COCO, LVIS, and RefCOCO for Grounding DINO, and 102.9% and 66.9% on MS COCO and RefCOCO for Florence-2. Additionally, our method can interpret failures in visual grounding and object detection tasks, surpassing existing methods across multiple evaluation metrics. The code will be released at https://github.com/RuoyuChen10/VPS.
   Submitted 25 November, 2024; originally announced November 2024.
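The search itself is described abstractly. Stripped of the paper's consistency and collaboration scores, a perturbation-style search over sparse sub-regions reduces to a greedy loop like the following, where `score_fn` is an assumed black-box detection score and the grid partition is a simplification:

```python
import numpy as np

def greedy_region_search(image, score_fn, grid=8, k=10):
    """Greedy stand-in for a region search: repeatedly add the grid cell
    whose unmasking most increases the model's score for the object.
    'score_fn(img) -> float' is a black box (e.g. detection confidence);
    the paper's consistency/collaboration scores are omitted."""
    h, w = image.shape[0] // grid, image.shape[1] // grid
    cells = [(r, c) for r in range(grid) for c in range(grid)]
    chosen = []

    def reveal(selected):
        masked = np.zeros_like(image)
        for r, c in selected:
            masked[r * h:(r + 1) * h, c * w:(c + 1) * w] = \
                image[r * h:(r + 1) * h, c * w:(c + 1) * w]
        return masked

    for _ in range(k):
        remaining = [c for c in cells if c not in chosen]
        best = max(remaining, key=lambda c: score_fn(reveal(chosen + [c])))
        chosen.append(best)
    return chosen  # most decision-critical cells, in selection order
```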
6. arXiv:2411.15944 [pdf, other] cs.LG
   Customer Lifetime Value Prediction with Uncertainty Estimation Using Monte Carlo Dropout
   Authors: Xinzhe Cao, Yadong Xu, Xiaofeng Yang
   Abstract: Accurately predicting customer Lifetime Value (LTV) is crucial for companies to optimize their revenue strategies. Traditional deep learning models for LTV prediction are effective but typically provide only point estimates and fail to capture model uncertainty in modeling user behaviors. To address this limitation, we propose a novel approach that enhances the architecture of purely neural network models by incorporating the Monte Carlo Dropout (MCD) framework. We benchmarked the proposed method using data from one of the most downloaded mobile games in the world, and demonstrated a substantial improvement in predictive Top 5% Mean Absolute Percentage Error compared to existing state-of-the-art methods. Additionally, our approach provides a confidence metric as an extra dimension for performance evaluation across various neural network models, facilitating more informed business decisions.
   Submitted 24 November, 2024; originally announced November 2024.
   Comments: 9 pages, 3 figures
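Monte Carlo Dropout itself is standard: keep dropout active at inference and aggregate many stochastic forward passes, using the spread across passes as the uncertainty estimate. A minimal PyTorch sketch (the regressor below is a placeholder, not the paper's LTV architecture):

```python
import torch
import torch.nn as nn

model = nn.Sequential(                # placeholder LTV regressor
    nn.Linear(32, 64), nn.ReLU(), nn.Dropout(p=0.2),
    nn.Linear(64, 64), nn.ReLU(), nn.Dropout(p=0.2),
    nn.Linear(64, 1),
)

@torch.no_grad()
def mc_dropout_predict(model, x, passes=100):
    """Keep dropout ON at inference; each pass samples a new mask."""
    model.train()                     # .train() keeps Dropout stochastic
    preds = torch.stack([model(x) for _ in range(passes)])
    return preds.mean(0), preds.std(0)   # point estimate + uncertainty

x = torch.randn(8, 32)                # batch of 8 users, 32 features each
ltv_mean, ltv_std = mc_dropout_predict(model, x)
```

The per-user standard deviation is the extra "confidence dimension" the abstract refers to: a wide spread flags users whose predicted LTV should be trusted less.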
7. arXiv:2411.15604 [pdf, other] cs.CV
   FATE: Full-head Gaussian Avatar with Textural Editing from Monocular Video
   Authors: Jiawei Zhang, Zijian Wu, Zhiyang Liang, Yicheng Gong, Dongfang Hu, Yao Yao, Xun Cao, Hao Zhu
   Abstract: Reconstructing high-fidelity, animatable 3D head avatars from effortlessly captured monocular videos is a pivotal yet formidable challenge. Although significant progress has been made in rendering performance and manipulation capabilities, notable challenges remain, including incomplete reconstruction and inefficient Gaussian representation. To address these challenges, we introduce FATE, a novel method for reconstructing an editable full-head avatar from a single monocular video. FATE integrates a sampling-based densification strategy to ensure optimal positional distribution of points, improving rendering efficiency. A neural baking technique is introduced to convert discrete Gaussian representations into continuous attribute maps, facilitating intuitive appearance editing. Furthermore, we propose a universal completion framework to recover non-frontal appearance, culminating in a 360°-renderable 3D head avatar. FATE outperforms previous approaches in both qualitative and quantitative evaluations, achieving state-of-the-art performance. To the best of our knowledge, FATE is the first animatable and 360° full-head monocular reconstruction method for a 3D head avatar. The code will be publicly released upon publication.
   Submitted 23 November, 2024; originally announced November 2024.
   Comments: project page: https://zjwfufu.github.io/FATE-page/

8. arXiv:2411.15497 [pdf, other] cs.CV
   AeroGen: Enhancing Remote Sensing Object Detection with Diffusion-Driven Data Generation
   Authors: Datao Tang, Xiangyong Cao, Xuan Wu, Jialin Li, Jing Yao, Xueru Bai, Deyu Meng
   Abstract: Remote sensing image object detection (RSIOD) aims to identify and locate specific objects within satellite or aerial imagery. However, there is a scarcity of labeled data in current RSIOD datasets, which significantly limits the performance of current detection algorithms. Although existing techniques, e.g., data augmentation and semi-supervised learning, can mitigate this scarcity issue to some extent, they are heavily dependent on high-quality labeled data and perform worse in rare object classes. To address this issue, this paper proposes a layout-controllable diffusion generative model (i.e. AeroGen) tailored for RSIOD. To our knowledge, AeroGen is the first model to simultaneously support horizontal and rotated bounding box condition generation, thus enabling the generation of high-quality synthetic images that meet specific layout and object category requirements. Additionally, we propose an end-to-end data augmentation framework that integrates a diversity-conditioned generator and a filtering mechanism to enhance both the diversity and quality of generated data. Experimental results demonstrate that the synthetic data produced by our method are of high quality and diversity. Furthermore, the synthetic RSIOD data can significantly improve the detection performance of existing RSIOD models, i.e., the mAP metrics on the DIOR, DIOR-R, and HRSC datasets are improved by 3.7%, 4.3%, and 2.43%, respectively. The code is available at https://github.com/Sonettoo/AeroGen.
   Submitted 26 November, 2024; v1 submitted 23 November, 2024; originally announced November 2024.
9. arXiv:2411.14630 [pdf] physics.med-ph, cs.LG, eess.IV
   ACE-Net: AutofoCus-Enhanced Convolutional Network for Field Imperfection Estimation with application to high b-value spiral Diffusion MRI
   Authors: Mengze Gao, Zachary Shah, Xiaozhi Cao, Nan Wang, Daniel Abraham, Kawin Setsompop
   Abstract: Spatiotemporal magnetic field variations from B0-inhomogeneity and diffusion-encoding-induced eddy-currents can be detrimental to rapid image-encoding schemes such as spiral, EPI and 3D-cones, resulting in undesirable image artifacts. In this work, a data-driven approach for automatic estimation of these field imperfections is developed by combining autofocus metrics with deep learning, and by leveraging a compact basis representation of the expected field imperfections. The method was applied to single-shot spiral diffusion MRI at high b-values, where accurate estimates of B0 and eddy-current fields were obtained, resulting in high-quality image reconstruction without the need for additional external calibrations.
   Submitted 21 November, 2024; originally announced November 2024.
   Comments: 8 pages, 5 figures, submitted to the International Society for Magnetic Resonance in Medicine 32nd Scientific Meeting, 2025
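Autofocus metrics of the kind referenced here score a candidate reconstruction by its sharpness, so a field estimate can be chosen (or a network supervised) by whichever candidate scores best. A toy gradient-energy version, with the reconstruction step `recon_fn` assumed rather than shown:

```python
import numpy as np

def gradient_energy(img):
    """Simple autofocus metric: sharper reconstructions retain more
    high-frequency content, hence larger summed gradient energy."""
    gx = np.diff(img, axis=1)
    gy = np.diff(img, axis=0)
    return float((gx**2).sum() + (gy**2).sum())

def pick_field_offset(recon_fn, candidate_offsets):
    """Choose the B0/eddy candidate whose reconstruction is sharpest.
    'recon_fn(offset) -> image' is assumed to exist and is not shown."""
    return max(candidate_offsets,
               key=lambda off: gradient_energy(recon_fn(off)))
```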
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures, submitted to International Society for Magnetic Resonance in Medicine 32th Scientific Meeting, 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14001">arXiv:2411.14001</a> <span> [<a href="https://arxiv.org/pdf/2411.14001">pdf</a>, <a href="https://arxiv.org/format/2411.14001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Graph Domain Adaptation with Dual-branch Encoder and Two-level Alignment for Whole Slide Image-based Survival Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shou%2C+Y">Yuntao Shou</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+P">Peiqiang Yan</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xingjian Yuan</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiangyong Cao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qian Zhao</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+D">Deyu Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14001v1-abstract-short" style="display: inline;"> In recent years, histopathological whole slide image (WSI)- based survival analysis has attracted much attention in medical image analysis. In practice, WSIs usually come from different hospitals or laboratories, which can be seen as different domains, and thus may have significant differences in imaging equipment, processing procedures, and sample sources. These differences generally result in la… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14001v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14001v1-abstract-full" style="display: none;"> In recent years, histopathological whole slide image (WSI)- based survival analysis has attracted much attention in medical image analysis. In practice, WSIs usually come from different hospitals or laboratories, which can be seen as different domains, and thus may have significant differences in imaging equipment, processing procedures, and sample sources. These differences generally result in large gaps in distribution between different WSI domains, and thus the survival analysis models trained on one domain may fail to transfer to another. To address this issue, we propose a Dual-branch Encoder and Two-level Alignment (DETA) framework to explore both feature and category-level alignment between different WSI domains. Specifically, we first formulate the concerned problem as graph domain adaptation (GDA) by virtue the graph representation of WSIs. Then we construct a dual-branch graph encoder, including the message passing branch and the shortest path branch, to explicitly and implicitly extract semantic information from the graph-represented WSIs. 
To realize GDA, we propose a two-level alignment approach: at the category level, we develop a coupling technique by virtue of the dual-branch structure, leading to reduced divergence between the category distributions of the two domains; at the feature level, we introduce an adversarial perturbation strategy to better augment source domain feature, resulting in improved alignment in feature distribution. To the best of our knowledge, our work is the first attempt to alleviate the domain shift issue for WSI data analysis. Extensive experiments on four TCGA datasets have validated the effectiveness of our proposed DETA framework and demonstrated its superior performance in WSI-based survival analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14001v1-abstract-full').style.display = 'none'; document.getElementById('2411.14001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11943">arXiv:2411.11943</a> <span> [<a href="https://arxiv.org/pdf/2411.11943">pdf</a>, <a href="https://arxiv.org/format/2411.11943">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Medical Video Generation for Disease Progression Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+K">Kaizhao Liang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+K">Kuei-Da Liao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+T">Tianren Gao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenqian Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jintai Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zhiguang Ding</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jianguo Cao</a>, <a href="/search/cs?searchtype=author&query=Rehg%2C+J+M">James M. Rehg</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jimeng Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11943v1-abstract-short" style="display: inline;"> Modeling disease progression is crucial for improving the quality and efficacy of clinical diagnosis and prognosis, but it is often hindered by a lack of longitudinal medical image monitoring for individual patients. 
To address this challenge, we propose the first Medical Video Generation (MVG) framework that enables controlled manipulation of disease-related image and video features, allowing pre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11943v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11943v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11943v1-abstract-full" style="display: none;"> Modeling disease progression is crucial for improving the quality and efficacy of clinical diagnosis and prognosis, but it is often hindered by a lack of longitudinal medical image monitoring for individual patients. To address this challenge, we propose the first Medical Video Generation (MVG) framework that enables controlled manipulation of disease-related image and video features, allowing precise, realistic, and personalized simulations of disease progression. Our approach begins by leveraging large language models (LLMs) to recaption prompt for disease trajectory. Next, a controllable multi-round diffusion model simulates the disease progression state for each patient, creating realistic intermediate disease state sequence. Finally, a diffusion-based video transition generation model interpolates disease progression between these states. We validate our framework across three medical imaging domains: chest X-ray, fundus photography, and skin image. Our results demonstrate that MVG significantly outperforms baseline models in generating coherent and clinically plausible disease trajectories. Two user studies by veteran physicians, provide further validation and insights into the clinical utility of the generated sequences. MVG has the potential to assist healthcare providers in modeling disease trajectories, interpolating missing medical image data, and enhancing medical education through realistic, dynamic visualizations of disease progression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11943v1-abstract-full').style.display = 'none'; document.getElementById('2411.11943v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech Report. The appendix will release soon. 
arXiv admin note: text overlap with arXiv:2309.11745</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11913">arXiv:2411.11913</a> <span> [<a href="https://arxiv.org/pdf/2411.11913">pdf</a>, <a href="https://arxiv.org/format/2411.11913">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> On-Board Vision-Language Models for Personalized Autonomous Vehicle Motion Control: System Design and Real-World Validation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+C">Can Cui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zichong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yupeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Juntong Peng</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sung-Yeon Park</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Cong Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yunsheng Ma</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenqian Ye</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yiheng Feng</a>, <a href="/search/cs?searchtype=author&query=Panchal%2C+J">Jitesh Panchal</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lingxi Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yaobin Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziran Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11913v1-abstract-short" style="display: inline;"> Personalized driving refers to an autonomous vehicle's ability to adapt its driving behavior or control strategies to match individual users' preferences and driving styles while maintaining safety and comfort standards. However, existing works either fail to capture every individual preference precisely or become computationally inefficient as the user base expands. Vision-Language Models (VLMs)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11913v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11913v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11913v1-abstract-full" style="display: none;"> Personalized driving refers to an autonomous vehicle's ability to adapt its driving behavior or control strategies to match individual users' preferences and driving styles while maintaining safety and comfort standards. However, existing works either fail to capture every individual preference precisely or become computationally inefficient as the user base expands. Vision-Language Models (VLMs) offer promising solutions on this front through their natural language understanding and scene reasoning capabilities. In this work, we propose a lightweight yet effective on-board VLM framework that provides low-latency personalized driving performance while maintaining strong reasoning capabilities.
Our solution incorporates a Retrieval-Augmented Generation (RAG)-based memory module that enables continuous learning of individual driving preferences through human feedback. Through comprehensive real-world vehicle deployment and experiments, our system has demonstrated the ability to provide safe, comfortable, and personalized driving experiences across various scenarios and significantly reduce takeover rates by up to 76.9%. To the best of our knowledge, this work represents the first end-to-end VLM-based motion control system in real-world autonomous vehicles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11913v1-abstract-full').style.display = 'none'; document.getElementById('2411.11913v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10888">arXiv:2411.10888</a> <span> [<a href="https://arxiv.org/pdf/2411.10888">pdf</a>, <a href="https://arxiv.org/format/2411.10888">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MpoxVLM: A Vision-Language Model for Diagnosing Skin Lesions from Mpox Virus Infection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenqian Ye</a>, <a href="/search/cs?searchtype=author&query=Moise%2C+K">Kenny Moise</a>, <a href="/search/cs?searchtype=author&query=Coffee%2C+M">Megan Coffee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10888v1-abstract-short" style="display: inline;"> In the aftermath of the COVID-19 pandemic and amid accelerating climate change, emerging infectious diseases, particularly those arising from zoonotic spillover, remain a global threat. Mpox (caused by the monkeypox virus) is a notable example of a zoonotic infection that often goes undiagnosed, especially as its rash progresses through stages, complicating detection across diverse populations wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10888v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10888v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10888v1-abstract-full" style="display: none;"> In the aftermath of the COVID-19 pandemic and amid accelerating climate change, emerging infectious diseases, particularly those arising from zoonotic spillover, remain a global threat. 
Mpox (caused by the monkeypox virus) is a notable example of a zoonotic infection that often goes undiagnosed, especially as its rash progresses through stages, complicating detection across diverse populations with different presentations. In August 2024, the WHO Director-General declared the mpox outbreak a public health emergency of international concern for a second time. Despite the deployment of deep learning techniques for detecting diseases from skin lesion images, a robust and publicly accessible foundation model for mpox diagnosis is still lacking due to the unavailability of open-source mpox skin lesion images, multimodal clinical data, and specialized training pipelines. To address this gap, we propose MpoxVLM, a vision-language model (VLM) designed to detect mpox by analyzing both skin lesion images and patient clinical information. MpoxVLM integrates the CLIP visual encoder, an enhanced Vision Transformer (ViT) classifier for skin lesions, and LLaMA-2-7B models, pre-trained and fine-tuned on visual instruction-following question-answer pairs from our newly released mpox skin lesion dataset. Our work achieves 90.38% accuracy for mpox detection, offering a promising pathway to improve early diagnostic accuracy in combating mpox. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10888v1-abstract-full').style.display = 'none'; document.getElementById('2411.10888v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ML4H 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09209">arXiv:2411.09209</a> <span> [<a href="https://arxiv.org/pdf/2411.09209">pdf</a>, <a href="https://arxiv.org/format/2411.09209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> JoyVASA: Portrait and Animal Image Animation with Diffusion-Based Audio-Driven Facial Dynamics and Head Motion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuyang Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoxin Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Sheng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jun Zhao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yang Yao</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+J">Jintao Fei</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Minyu Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09209v3-abstract-short" style="display: inline;"> Audio-driven portrait animation has made significant advances with diffusion-based models, improving video quality and lipsync accuracy. 
However, the increasing complexity of these models has led to inefficiencies in training and inference, as well as constraints on video length and inter-frame continuity. In this paper, we propose JoyVASA, a diffusion-based method for generating facial dynamics a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09209v3-abstract-full').style.display = 'inline'; document.getElementById('2411.09209v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09209v3-abstract-full" style="display: none;"> Audio-driven portrait animation has made significant advances with diffusion-based models, improving video quality and lipsync accuracy. However, the increasing complexity of these models has led to inefficiencies in training and inference, as well as constraints on video length and inter-frame continuity. In this paper, we propose JoyVASA, a diffusion-based method for generating facial dynamics and head motion in audio-driven facial animation. Specifically, in the first stage, we introduce a decoupled facial representation framework that separates dynamic facial expressions from static 3D facial representations. This decoupling allows the system to generate longer videos by combining any static 3D facial representation with dynamic motion sequences. Then, in the second stage, a diffusion transformer is trained to generate motion sequences directly from audio cues, independent of character identity. Finally, a generator trained in the first stage uses the 3D facial representation and the generated motion sequences as inputs to render high-quality animations. With the decoupled facial representation and the identity-independent motion generation process, JoyVASA extends beyond human portraits to animate animal faces seamlessly. The model is trained on a hybrid dataset of private Chinese and public English data, enabling multilingual support. Experimental results validate the effectiveness of our approach. Future work will focus on improving real-time performance and refining expression control, further expanding the applications in portrait animation. The code is available at: https://github.com/jdh-algo/JoyVASA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09209v3-abstract-full').style.display = 'none'; document.getElementById('2411.09209v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
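</p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Illustrative sketch:</span> a minimal rendition of the decoupling described in the JoyVASA abstract, not the paper's actual implementation. The stand-in audio_to_motion and render functions, the additive "renderer", and all dimensions are assumed placeholders; the point is that an identity-independent motion sequence can drive any static representation, human or animal.</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)

def audio_to_motion(audio_feats):
    """Stand-in for the stage-2 diffusion transformer: maps per-frame
    audio features to motion offsets, independent of any identity."""
    W = rng.standard_normal((audio_feats.shape[1], 8)) * 0.1  # placeholder "model"
    return audio_feats @ W                     # (num_frames, 8)

def render(static_id, motion):
    """Stand-in for the stage-1 generator: combines a static facial
    representation with a motion sequence (here, simple addition)."""
    return static_id[None, :] + motion         # (num_frames, 8)

audio = rng.standard_normal((120, 32))         # ~120 frames of audio features
motion_seq = audio_to_motion(audio)            # identity-independent motion

human_face = rng.standard_normal(8)            # static representation A
animal_face = rng.standard_normal(8)           # static representation B

# The same motion sequence drives both identities: the decoupling that
# lets a JoyVASA-style pipeline animate arbitrary (even animal) faces.
video_a = render(human_face, motion_seq)
video_b = render(animal_face, motion_seq)
print(video_a.shape, video_b.shape)            # (120, 8) (120, 8)
</code></pre> <p class="is-size-7">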
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08299">arXiv:2411.08299</a> <span> [<a href="https://arxiv.org/pdf/2411.08299">pdf</a>, <a href="https://arxiv.org/format/2411.08299">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DNN Task Assignment in UAV Networks: A Generative AI Enhanced Multi-Agent Reinforcement Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xin Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+W">Wenjie Weng</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+B">Binhan Liao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xianbin Cao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaohuan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08299v1-abstract-short" style="display: inline;"> Unmanned Aerial Vehicles (UAVs) possess high mobility and flexible deployment capabilities, prompting the development of UAVs for various application scenarios within the Internet of Things (IoT). The unique capabilities of UAVs give rise to increasingly critical and complex tasks in uncertain and potentially harsh environments. The substantial amount of data generated from these applications nece… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08299v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08299v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08299v1-abstract-full" style="display: none;"> Unmanned Aerial Vehicles (UAVs) possess high mobility and flexible deployment capabilities, prompting the development of UAVs for various application scenarios within the Internet of Things (IoT). The unique capabilities of UAVs give rise to increasingly critical and complex tasks in uncertain and potentially harsh environments. The substantial amount of data generated from these applications necessitates processing and analysis through deep neural networks (DNNs). However, UAVs encounter challenges due to their limited computing resources when managing DNN models. This paper presents a joint approach that combines multi-agent reinforcement learning (MARL) and generative diffusion models (GDM) for assigning DNN tasks to a UAV swarm, aimed at reducing latency from task capture to result output. To address these challenges, we first consider the task size of the target area to be inspected and the shortest flying path as optimization constraints, employing a greedy algorithm to resolve the subproblem with a focus on minimizing the UAV's flying path and the overall system cost. In the second stage, we introduce a novel DNN task assignment algorithm, termed GDM-MADDPG, which utilizes the reverse denoising process of GDM to replace the actor network in multi-agent deep deterministic policy gradient (MADDPG).
This approach generates specific DNN task assignment actions based on agents' observations in a dynamic environment. Simulation results indicate that our algorithm performs favorably compared to benchmarks in terms of path planning, Age of Information (AoI), energy consumption, and task load balancing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08299v1-abstract-full').style.display = 'none'; document.getElementById('2411.08299v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06172">arXiv:2411.06172</a> <span> [<a href="https://arxiv.org/pdf/2411.06172">pdf</a>, <a href="https://arxiv.org/format/2411.06172">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> IDU-Detector: A Synergistic Framework for Robust Masquerader Attack Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zilin Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiulai Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xinyi Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Ke Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longjuan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L+B">Logan Bo-Yee Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06172v1-abstract-short" style="display: inline;"> In the digital age, users store personal data in corporate databases, making data security central to enterprise management. Given the extensive attack surface, assets face challenges like weak authentication, vulnerabilities, and malware. Attackers may exploit vulnerabilities to gain unauthorized access, masquerading as legitimate users. Such attacks can lead to privacy breaches, business disrupt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06172v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06172v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06172v1-abstract-full" style="display: none;"> In the digital age, users store personal data in corporate databases, making data security central to enterprise management. Given the extensive attack surface, assets face challenges like weak authentication, vulnerabilities, and malware. Attackers may exploit vulnerabilities to gain unauthorized access, masquerading as legitimate users. Such attacks can lead to privacy breaches, business disruption, financial losses, and reputational damage. Complex attack vectors blur lines between insider and external threats. To address this, we introduce the IDU-Detector, integrating Intrusion Detection Systems (IDS) with User and Entity Behavior Analytics (UEBA). 
This integration monitors unauthorized access, bridges system gaps, ensures continuous monitoring, and enhances threat identification. Existing insider threat datasets lack depth and coverage of diverse attack vectors. This hinders detection technologies from addressing complex attack surfaces. We propose new, diverse datasets covering more attack scenarios, enhancing detection technologies. In our evaluation, the IDU-Detector achieved average accuracies of 98.96% and 99.12%. These results demonstrate its effectiveness in detecting attacks, improving security and response speed, and providing stronger assurance of asset safety. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06172v1-abstract-full').style.display = 'none'; document.getElementById('2411.06172v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02669">arXiv:2411.02669</a> <span> [<a href="https://arxiv.org/pdf/2411.02669">pdf</a>, <a href="https://arxiv.org/format/2411.02669">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semantic-Aligned Adversarial Evolution Triangle for High-Transferability Vision-Language Attack </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Sensen Gao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qing Guo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Ke Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yihao Huang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+S">Simeng Qin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Fellow%2C+I+T">Ivor Tsang Fellow</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02669v1-abstract-short" style="display: inline;"> Vision-language pre-training (VLP) models excel at interpreting both images and text but remain vulnerable to multimodal adversarial examples (AEs).
Advancing the generation of transferable AEs, which succeed across unseen models, is key to developing more robust and practical VLP models. Previous approaches augment image-text pairs to enhance diversity within the adversarial example generation process, aiming to improve transferability by expanding the contrast space of image-text features. However, these methods focus solely on diversity around the current AEs, yielding limited gains in transferability. To address this issue, we propose to increase the diversity of AEs by leveraging the intersection regions along the adversarial trajectory during optimization. Specifically, we propose sampling from adversarial evolution triangles composed of clean, historical, and current adversarial examples to enhance adversarial diversity. We provide a theoretical analysis to demonstrate the effectiveness of the proposed adversarial evolution triangle. Moreover, we find that redundant inactive dimensions can dominate similarity calculations, distorting feature matching and making AEs model-dependent with reduced transferability. Hence, we propose to generate AEs in the semantic image-text feature contrast space, which can project the original feature space into a semantic corpus subspace. The proposed semantic-aligned subspace can reduce the image feature redundancy, thereby improving adversarial transferability. Extensive experiments across different datasets and models demonstrate that the proposed method can effectively improve adversarial transferability and outperform state-of-the-art adversarial attack methods. The code is released at https://github.com/jiaxiaojunQAQ/SA-AET. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02669v1-abstract-full').style.display = 'none'; document.getElementById('2411.02669v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
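</p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Illustrative sketch:</span> one plausible reading of the adversarial-evolution-triangle sampling described above: a random convex combination of the clean, historical, and current adversarial examples, re-projected into an L-infinity ball. The Dirichlet weighting, epsilon budget, and tensor shapes are assumptions, not the paper's exact procedure.</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)
EPS = 8 / 255                                  # assumed L-infinity budget

def sample_evolution_triangle(x_clean, x_hist, x_cur):
    """Draw one point from the adversarial evolution triangle: a random
    convex combination of the clean image, a historical adversarial
    example, and the current adversarial example."""
    a, b, c = rng.dirichlet([1.0, 1.0, 1.0])   # a + b + c = 1
    x = a * x_clean + b * x_hist + c * x_cur
    # Keep the sample a valid candidate: re-project the perturbation
    # into the epsilon-ball around the clean image, then into [0, 1].
    x = x_clean + np.clip(x - x_clean, -EPS, EPS)
    return np.clip(x, 0.0, 1.0)

x_clean = rng.uniform(0, 1, (3, 224, 224))
x_hist = np.clip(x_clean + rng.uniform(-EPS, EPS, x_clean.shape), 0, 1)
x_cur = np.clip(x_clean + rng.uniform(-EPS, EPS, x_clean.shape), 0, 1)

# Triangle samples diversify the AEs whose image-text feature contrast
# drives the attack objective during optimization.
batch = [sample_evolution_triangle(x_clean, x_hist, x_cur) for _ in range(4)]
print(np.max(np.abs(batch[0] - x_clean)))      # stays within EPS
</code></pre> <p class="is-size-7">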
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span> [<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. 
We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practices of Hunyuan-Large include large-scale synthetic data that is orders of magnitude larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidance for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01906">arXiv:2411.01906</a> <span> [<a href="https://arxiv.org/pdf/2411.01906">pdf</a>, <a href="https://arxiv.org/format/2411.01906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Connection Performance Modeling and Analysis of a Radiosonde Network in a Typhoon </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hanyi Liu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xianbin Cao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+P">Peng Yang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/cs?searchtype=author&query=Quek%2C+T+Q+S">Tony Q. S.
Quek</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D+O">Dapeng Oliver Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01906v3-abstract-short" style="display: inline;"> This paper is concerned with the theoretical modeling and analysis of uplink connection performance of a radiosonde network deployed in a typhoon. Similar to existing works, the stochastic geometry theory is leveraged to derive the expression of the uplink connection probability (CP) of a radiosonde. Nevertheless, existing works assume that network nodes are spherically or uniformly distributed. D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01906v3-abstract-full').style.display = 'inline'; document.getElementById('2411.01906v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01906v3-abstract-full" style="display: none;"> This paper is concerned with the theoretical modeling and analysis of uplink connection performance of a radiosonde network deployed in a typhoon. Similar to existing works, the stochastic geometry theory is leveraged to derive the expression of the uplink connection probability (CP) of a radiosonde. Nevertheless, existing works assume that network nodes are spherically or uniformly distributed. Different from the existing works, this paper investigates two particular motion patterns of radiosondes in a typhoon, which significantly challenges the theoretical analysis. According to their particular motion patterns, this paper first separately models the distributions of horizontal and vertical distances from a radiosonde to its receiver. Secondly, this paper derives the closed-form expressions of cumulative distribution function (CDF) and probability density function (PDF) of a radiosonde's three-dimensional (3D) propagation distance to its receiver. Thirdly, this paper derives the analytical expression of the uplink CP for any radiosonde in the network. Finally, extensive numerical simulations are conducted to validate the theoretical analysis, and the influence of various network design parameters is comprehensively discussed. Simulation results show that when the signal-to-interference-noise ratio (SINR) threshold is below -35 dB, and the density of radiosondes remains under 0.01/km^3, the uplink CP approaches 26%, 39%, and 50% in three patterns. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01906v3-abstract-full').style.display = 'none'; document.getElementById('2411.01906v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
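</p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Illustrative sketch:</span> the paper derives closed-form expressions; as a rough companion, a Monte Carlo estimate of the same quantity, the uplink connection probability P(SINR >= threshold), under assumed simplifications: interferers as a homogeneous Poisson process in a ball, power-law path loss, Rayleigh fading, and illustrative constants (density, radius, serving distance, noise power) throughout.</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)

def uplink_cp(density_per_km3=0.01, radius_km=30.0, alpha=3.0,
              sinr_db=-35.0, noise=1e-12, d0_km=10.0, trials=20_000):
    """Monte Carlo estimate of the uplink connection probability
    P(SINR >= threshold) for a radiosonde at serving distance d0_km,
    with interferers drawn from a homogeneous Poisson process in a ball."""
    thr = 10 ** (sinr_db / 10)
    vol = 4 / 3 * np.pi * radius_km ** 3
    hits = 0
    for _ in range(trials):
        n = rng.poisson(density_per_km3 * vol)   # number of interferers
        # Distances of uniform points in a ball, via inverse-CDF sampling.
        d = radius_km * rng.uniform(0, 1, n) ** (1 / 3)
        h = rng.exponential(1.0, n)              # Rayleigh fading (power)
        interference = np.sum(h * d ** (-alpha))
        signal = rng.exponential(1.0) * d0_km ** (-alpha)
        hits += signal / (noise + interference) >= thr
    return hits / trials

print(f"estimated uplink CP: {uplink_cp():.3f}")
</code></pre> <p class="is-size-7">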
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01779">arXiv:2411.01779</a> <span> [<a href="https://arxiv.org/pdf/2411.01779">pdf</a>, <a href="https://arxiv.org/format/2411.01779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> TabSec: A Collaborative Framework for Novel Insider Threat Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zilin Huang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiangyan Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongyu Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xinyi Cao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jieren Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01779v1-abstract-short" style="display: inline;"> In the era of the Internet of Things (IoT) and data sharing, users frequently upload their personal information to enterprise databases to enjoy enhanced service experiences provided by various online services. However, the widespread presence of system vulnerabilities, remote network intrusions, and insider threats significantly increases the exposure of private enterprise data on the internet. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01779v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01779v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01779v1-abstract-full" style="display: none;"> In the era of the Internet of Things (IoT) and data sharing, users frequently upload their personal information to enterprise databases to enjoy enhanced service experiences provided by various online services. However, the widespread presence of system vulnerabilities, remote network intrusions, and insider threats significantly increases the exposure of private enterprise data on the internet. If such data is stolen or leaked by attackers, it can result in severe asset losses and business operation disruptions. To address these challenges, this paper proposes a novel threat detection framework, TabITD. This framework integrates Intrusion Detection Systems (IDS) with User and Entity Behavior Analytics (UEBA) strategies to form a collaborative detection system that bridges the gaps in existing systems' capabilities. It effectively addresses the blurred boundaries between external and insider threats caused by the diversification of attack methods, thereby enhancing the model's learning ability and overall detection performance. Moreover, the proposed method leverages the TabNet architecture, which employs a sparse attention feature selection mechanism that allows TabNet to select the most relevant features at each decision step, thereby improving the detection of rare-class attacks. We evaluated our proposed solution on two different datasets, achieving average accuracies of 96.71% and 97.25%, respectively. 
The results demonstrate that this approach can effectively detect malicious behaviors such as masquerade attacks and external threats, significantly enhancing network security defenses and the efficiency of network attack detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01779v1-abstract-full').style.display = 'none'; document.getElementById('2411.01779v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01499">arXiv:2411.01499</a> <span> [<a href="https://arxiv.org/pdf/2411.01499">pdf</a>, <a href="https://arxiv.org/format/2411.01499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Polar R-CNN: End-to-End Lane Detection with Fewer Anchors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shengqi Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junmin Liu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiangyong Cao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zengjie Song</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+K">Kai Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01499v1-abstract-short" style="display: inline;"> Lane detection is a critical and challenging task in autonomous driving, particularly in real-world scenarios where traffic lanes can be slender, lengthy, and often obscured by other vehicles, complicating detection efforts. Existing anchor-based methods typically rely on prior lane anchors to extract features and subsequently refine the location and shape of lanes. While these methods achieve hig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01499v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01499v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01499v1-abstract-full" style="display: none;"> Lane detection is a critical and challenging task in autonomous driving, particularly in real-world scenarios where traffic lanes can be slender, lengthy, and often obscured by other vehicles, complicating detection efforts. Existing anchor-based methods typically rely on prior lane anchors to extract features and subsequently refine the location and shape of lanes. While these methods achieve high performance, manually setting prior anchors is cumbersome, and ensuring sufficient coverage across diverse datasets often requires a large amount of dense anchors. Furthermore, the use of Non-Maximum Suppression (NMS) to eliminate redundant predictions complicates real-world deployment and may underperform in complex scenarios. In this paper, we propose Polar R-CNN, an end-to-end anchor-based method for lane detection. 
By incorporating both local and global polar coordinate systems, Polar R-CNN facilitates flexible anchor proposals and significantly reduces the number of anchors required without compromising performance. Additionally, we introduce a triplet head with a heuristic structure that supports an NMS-free paradigm, enhancing deployment efficiency and performance in scenarios with dense lanes. Our method achieves competitive results on five popular lane detection benchmarks--Tusimple, CULane, LLAMAS, CurveLanes, and DL-Rail--while maintaining a lightweight design and straightforward structure. Our source code is available at https://github.com/ShqWW/PolarRCNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01499v1-abstract-full').style.display = 'none'; document.getElementById('2411.01499v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00453">arXiv:2411.00453</a> <span> [<a href="https://arxiv.org/pdf/2411.00453">pdf</a>, <a href="https://arxiv.org/format/2411.00453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Models as Network Optimizers: Explorations and Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruihuai Liang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pengyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xianjin Li</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+Y">Yifan Xue</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiwen Yu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuelin Cao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yan Zhang</a>, <a href="/search/cs?searchtype=author&query=Debbah%2C+M">Mérouane Debbah</a>, <a href="/search/cs?searchtype=author&query=Poor%2C+H+V">H. Vincent Poor</a>, <a href="/search/cs?searchtype=author&query=Yuen%2C+C">Chau Yuen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00453v2-abstract-short" style="display: inline;"> Network optimization is a fundamental challenge in the Internet of Things (IoT) network, often characterized by complex features that make it difficult to solve these problems. Recently, generative diffusion models (GDMs) have emerged as a promising new approach to network optimization, with the potential to directly address these optimization problems.
However, the application of GDMs in this fie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00453v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00453v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00453v2-abstract-full" style="display: none;"> Network optimization is a fundamental challenge in the Internet of Things (IoT) network, often characterized by complex features that make it difficult to solve these problems. Recently, generative diffusion models (GDMs) have emerged as a promising new approach to network optimization, with the potential to directly address these optimization problems. However, the application of GDMs in this field is still in its early stages, and there is a noticeable lack of theoretical research and empirical findings. In this study, we first explore the intrinsic characteristics of generative models. Next, we provide a concise theoretical proof and intuitive demonstration of the advantages of generative models over discriminative models in network optimization. Based on this exploration, we implement GDMs as optimizers aimed at learning high-quality solution distributions for given inputs, sampling from these distributions during inference to approximate or achieve optimal solutions. Specifically, we utilize denoising diffusion probabilistic models (DDPMs) and employ a classifier-free guidance mechanism to manage conditional guidance based on input parameters. We conduct extensive experiments across three challenging network optimization problems. By investigating various model configurations and the principles of GDMs as optimizers, we demonstrate the ability to overcome prediction errors and validate the convergence of generated solutions to optimal solutions. We provide code and data at https://github.com/qiyu3816/DiffSG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00453v2-abstract-full').style.display = 'none'; document.getElementById('2411.00453v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
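</p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Illustrative sketch:</span> the classifier-free guidance mechanism named in the abstract, in which one denoiser serves both branches and its conditional and unconditional noise predictions are blended before the DDPM reverse update. The toy denoiser, guidance weight, and schedule constants are assumptions, not the released DiffSG code.</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)

def denoiser(x_t, cond):
    """Stand-in for a trained DDPM noise-prediction network eps_theta;
    cond carries the optimization problem's inputs, and cond=None is
    the unconditional branch used by classifier-free guidance."""
    bias = 0.0 if cond is None else 0.05 * cond.sum()
    return 0.1 * x_t + bias                    # toy prediction

def guided_eps(x_t, cond, w=2.0):
    """Classifier-free guidance: eps = (1 + w) * eps_cond - w * eps_uncond."""
    return (1 + w) * denoiser(x_t, cond) - w * denoiser(x_t, None)

# One guided reverse (denoising) step with illustrative schedule values.
x_t = rng.standard_normal(16)                  # noised candidate solution
cond = rng.uniform(0, 1, 4)                    # problem parameters as condition
beta, alpha, alpha_bar = 0.02, 0.98, 0.5
eps = guided_eps(x_t, cond, w=2.0)
x_prev = (x_t - beta / np.sqrt(1 - alpha_bar) * eps) / np.sqrt(alpha)
print(x_prev.shape)                            # (16,)
</code></pre> <p class="is-size-7">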
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23946">arXiv:2410.23946</a> <span> [<a href="https://arxiv.org/pdf/2410.23946">pdf</a>, <a href="https://arxiv.org/format/2410.23946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MV-CC: Mask Enhanced Video Model for Remote Sensing Change Caption </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruixun Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kaiyu Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jiayi Song</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Dongwei Sun</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiangyong Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23946v1-abstract-short" style="display: inline;"> Remote sensing image change caption (RSICC) aims to provide natural language descriptions for bi-temporal remote sensing images. Since the Change Caption (CC) task requires both spatial and temporal features, previous works follow an encoder-fusion-decoder architecture. They use an image encoder to extract spatial features and a fusion module to integrate spatial features and extract temporal featur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23946v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23946v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23946v1-abstract-full" style="display: none;"> Remote sensing image change caption (RSICC) aims to provide natural language descriptions for bi-temporal remote sensing images. Since the Change Caption (CC) task requires both spatial and temporal features, previous works follow an encoder-fusion-decoder architecture. They use an image encoder to extract spatial features and a fusion module to integrate spatial features and extract temporal features, which leads to increasingly complex manual design of the fusion module. In this paper, we introduce a novel video model-based paradigm without the design of a fusion module and propose a Mask-enhanced Video model for Change Caption (MV-CC). Specifically, we use the off-the-shelf video encoder to simultaneously extract the temporal and spatial features of bi-temporal images. Furthermore, the types of changes in the CC task are set based on specific task requirements, and to enable the model to better focus on the regions of interest, we employ masks obtained from the Change Detection (CD) method to explicitly guide the CC model. Experimental results demonstrate that our proposed method can obtain better performance compared with other state-of-the-art RSICC methods. The code is available at https://github.com/liuruixun/MV-CC.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23946v1-abstract-full').style.display = 'none'; document.getElementById('2410.23946v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22959">arXiv:2410.22959</a> <span> [<a href="https://arxiv.org/pdf/2410.22959">pdf</a>, <a href="https://arxiv.org/format/2410.22959">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EnsIR: An Ensemble Algorithm for Image Restoration via Gaussian Mixture Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shangquan Sun</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+W">Wenqi Ren</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zikun Liu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+H">Hyunhee Park</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22959v1-abstract-short" style="display: inline;"> Image restoration has experienced significant advancements due to the development of deep learning. Nevertheless, it encounters challenges related to ill-posed problems, resulting in deviations between single model predictions and ground-truths. Ensemble learning, as a powerful machine learning technique, aims to address these deviations by combining the predictions of multiple base models. Most e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22959v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22959v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22959v1-abstract-full" style="display: none;"> Image restoration has experienced significant advancements due to the development of deep learning. Nevertheless, it encounters challenges related to ill-posed problems, resulting in deviations between single model predictions and ground-truths. Ensemble learning, as a powerful machine learning technique, aims to address these deviations by combining the predictions of multiple base models. Most existing works adopt ensemble learning during the design of restoration models, while only limited research focuses on the inference-stage ensemble of pre-trained restoration models. Regression-based methods fail to enable efficient inference, leading researchers in academia and industry to prefer averaging as their choice for post-training ensemble. To address this, we reformulate the ensemble problem of image restoration into Gaussian mixture models (GMMs) and employ an expectation maximization (EM)-based algorithm to estimate ensemble weights for aggregating prediction candidates. 
We estimate the range-wise ensemble weights on a reference set and store them in a lookup table (LUT) for efficient ensemble inference on the test set. Our algorithm is model-agnostic and training-free, allowing seamless integration and enhancement of various pre-trained image restoration models. It consistently outperforms regression-based methods and averaging ensemble approaches on 14 benchmarks across 3 image restoration tasks, including super-resolution, deblurring and deraining. The code and all estimated weights have been released on GitHub. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22959v1-abstract-full').style.display = 'none'; document.getElementById('2410.22959v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages for main manuscript, additional 17 pages for appendix, 18 figures, 17MB</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21804">arXiv:2410.21804</a> <span> [<a href="https://arxiv.org/pdf/2410.21804">pdf</a>, <a href="https://arxiv.org/format/2410.21804">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Effective Weight-Ensembling Mixture of Experts for Multi-Task Model Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+A">Anke Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+E">Enneng Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+G">Guibing Guo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yong Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lefei Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21804v1-abstract-short" style="display: inline;"> Multi-task learning (MTL) leverages a shared model to accomplish multiple tasks and facilitate knowledge transfer. Recent research on task arithmetic-based MTL demonstrates that merging the parameters of independently fine-tuned models can effectively achieve MTL.
However, existing merging methods primarily seek a static optimal solution within the original model parameter space, which often resul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21804v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21804v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21804v1-abstract-full" style="display: none;"> Multi-task learning (MTL) leverages a shared model to accomplish multiple tasks and facilitate knowledge transfer. Recent research on task arithmetic-based MTL demonstrates that merging the parameters of independently fine-tuned models can effectively achieve MTL. However, existing merging methods primarily seek a static optimal solution within the original model parameter space, which often results in performance degradation due to the inherent diversity among tasks and potential interference. To address this challenge, in this paper, we propose a Weight-Ensembling Mixture of Experts (WEMoE) method for multi-task model merging. Specifically, we first identify critical (or sensitive) modules by analyzing parameter variations in core modules of Transformer-based models before and after fine-tuning. Then, our WEMoE statically merges non-critical modules while transforming critical modules into a mixture-of-experts (MoE) structure. During inference, expert modules in the MoE are dynamically merged based on input samples, enabling a more flexible and adaptive merging approach. Building on WEMoE, we further introduce an efficient-and-effective WEMoE (E-WEMoE) method, whose core mechanism involves eliminating non-essential elements in the critical modules of WEMoE and implementing shared routing across multiple MoE modules, thereby significantly reducing the trainable parameters, the overall parameter count, and computational overhead of the merged model by WEMoE. Experimental results across various architectures and tasks demonstrate that both WEMoE and E-WEMoE outperform state-of-the-art (SOTA) model merging methods in terms of MTL performance, generalization, and robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21804v1-abstract-full').style.display = 'none'; document.getElementById('2410.21804v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
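<p>The dynamic-merging step described above can be pictured with a small PyTorch sketch (an assumed form for illustration; module and variable names are mine, not the paper's): a router turns the input into mixing coefficients over task vectors of a critical layer, and the layer's weights are re-merged per batch.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class WeightEnsemblingLinear(nn.Module):
    """Sketch of per-input weight merging for one critical linear module.

    task_vectors holds (dW, db) pairs: fine-tuned minus pre-trained weights.
    """
    def __init__(self, base, task_vectors):
        super().__init__()
        self.base = base
        self.task_vectors = task_vectors
        self.router = nn.Linear(base.in_features, len(task_vectors))

    def forward(self, x):
        # route on a summary of the batch, then merge expert weight deltas
        coef = torch.softmax(self.router(x.mean(dim=0, keepdim=True)), dim=-1)[0]
        W = self.base.weight + sum(c * dW for c, (dW, _) in zip(coef, self.task_vectors))
        b = self.base.bias + sum(c * db for c, (_, db) in zip(coef, self.task_vectors))
        return nn.functional.linear(x, W, b)
</code></pre> <p>Non-critical modules would stay statically merged; E-WEMoE's shared routing would reuse one such router across all MoE-converted modules.</p>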
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21017">arXiv:2410.21017</a> <span> [<a href="https://arxiv.org/pdf/2410.21017">pdf</a>, <a href="https://arxiv.org/format/2410.21017">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Edge Perception: Intelligent Wireless Sensing at Network Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yuanhao Cui</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaowen Cao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiali Nie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jie Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21017v1-abstract-short" style="display: inline;"> Future sixth-generation (6G) networks are envisioned to support intelligent applications across various vertical scenarios, which have stringent requirements on high-precision sensing as well as ultra-low-latency data processing and decision making. Towards this end, a new paradigm of edge perception networks emerges, which integrates wireless sensing, communication, computation, and artificial in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21017v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21017v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21017v1-abstract-full" style="display: none;"> Future sixth-generation (6G) networks are envisioned to support intelligent applications across various vertical scenarios, which have stringent requirements on high-precision sensing as well as ultra-low-latency data processing and decision making. Towards this end, a new paradigm of edge perception networks emerges, which integrates wireless sensing, communication, computation, and artificial intelligence (AI) capabilities at network edge for intelligent sensing and data processing. This article provides a timely overview of this emerging topic. We commence by discussing wireless edge perception, including physical layer transceiver design, network-wise cooperation, and application-specific data analytics, for which the prospects and challenges are emphasized. Next, we discuss the interplay between edge AI and wireless sensing in edge perception, and present various key techniques for two paradigms, namely edge AI empowered sensing and task-oriented sensing for edge AI. Finally, we emphasize interesting research directions on edge perception to motivate future work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21017v1-abstract-full').style.display = 'none'; document.getElementById('2410.21017v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16954">arXiv:2410.16954</a> <span> [<a href="https://arxiv.org/pdf/2410.16954">pdf</a>, <a href="https://arxiv.org/format/2410.16954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> LoRA-C: Parameter-Efficient Fine-Tuning of Robust CNN for IoT Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+C">Chuntao Ding</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jianhang Xie</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Linlin Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shangguang Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhichao Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16954v2-abstract-short" style="display: inline;"> Efficient fine-tuning of pre-trained convolutional neural network (CNN) models using local data is essential for providing high-quality services to users using ubiquitous and resource-limited Internet of Things (IoT) devices. Low-Rank Adaptation (LoRA) fine-tuning has attracted widespread attention from industry and academia because it is simple, efficient, and does not incur any additional reason… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16954v2-abstract-full').style.display = 'inline'; document.getElementById('2410.16954v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16954v2-abstract-full" style="display: none;"> Efficient fine-tuning of pre-trained convolutional neural network (CNN) models using local data is essential for providing high-quality services to users using ubiquitous and resource-limited Internet of Things (IoT) devices. Low-Rank Adaptation (LoRA) fine-tuning has attracted widespread attention from industry and academia because it is simple, efficient, and does not incur any additional reasoning burden. However, most of the existing advanced methods use LoRA to fine-tune Transformers, and there are few studies on using LoRA to fine-tune CNNs. CNN models are widely deployed on IoT devices because of their favorable balance between resource occupancy and performance. Moreover, IoT devices are widely deployed outdoors and usually process data affected by the environment (such as fog, snow, rain, etc.). The goal of this paper is to use LoRA technology to efficiently improve the robustness of CNN models. To this end, this paper first proposes a robust CNN fine-tuning method for IoT devices, LoRA-C, which performs low-rank decomposition in convolutional layers rather than kernel units to reduce the number of fine-tuning parameters. Then, this paper analyzes two different rank settings in detail and observes that the best performance is usually achieved when $\alpha/r$ is a constant, on either standard or corrupted data. This finding provides practical guidance for the widespread application of LoRA-C.
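<p>A minimal PyTorch sketch of convolution-level low-rank adaptation as described here (my reading of the abstract, not the authors' repository): the frozen kernel of shape (out, in, k, k) receives an additive update B·A reshaped to the same shape, scaled by the $\alpha/r$ ratio the paper analyzes.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class LoRAConv2d(nn.Module):
    """Low-rank adapter at the granularity of a whole conv layer."""
    def __init__(self, conv, r=4, alpha=8):
        super().__init__()
        self.conv = conv
        for p in self.conv.parameters():
            p.requires_grad = False               # freeze pre-trained weights
        out_c, in_c, kh, kw = conv.weight.shape
        self.A = nn.Parameter(torch.randn(r, in_c * kh * kw) * 0.01)
        self.B = nn.Parameter(torch.zeros(out_c, r))  # zero init: no drift at start
        self.scale = alpha / r                    # the alpha/r constant discussed above

    def forward(self, x):
        dW = (self.B @ self.A).view(self.conv.weight.shape) * self.scale
        return nn.functional.conv2d(x, self.conv.weight + dW, self.conv.bias,
                                    self.conv.stride, self.conv.padding)
</code></pre>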
Finally, this paper conducts extensive experiments based on pre-trained models. Experimental results on CIFAR-10, CIFAR-100, CIFAR-10-C, and Icons50 datasets show that the proposed LoRA-C outperforms standard ResNets. Specifically, on the CIFAR-10-C dataset, LoRA-C-ResNet-101 achieves 83.44% accuracy, surpassing the standard ResNet-101 result by +9.5%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16954v2-abstract-full').style.display = 'none'; document.getElementById('2410.16954v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 13 figures, https://github.com/alexyyds2024/lora-C</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16845">arXiv:2410.16845</a> <span> [<a href="https://arxiv.org/pdf/2410.16845">pdf</a>, <a href="https://arxiv.org/format/2410.16845">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fast Graph Sharpness-Aware Minimization for Enhancing and Accelerating Few-Shot Node Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yihong Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuhan Chen</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+S">Siya Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yan Zhou</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jing Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16845v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have shown superior performance in node classification. However, GNNs perform poorly in the Few-Shot Node Classification (FSNC) task that requires robust generalization to make accurate predictions for unseen classes with limited labels. To tackle the challenge, we propose the integration of Sharpness-Aware Minimization (SAM)--a technique designed to enhance model gene… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16845v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16845v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16845v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have shown superior performance in node classification.
However, GNNs perform poorly in the Few-Shot Node Classification (FSNC) task that requires robust generalization to make accurate predictions for unseen classes with limited labels. To tackle the challenge, we propose the integration of Sharpness-Aware Minimization (SAM)--a technique designed to enhance model generalization by finding a flat minimum of the loss landscape--into GNN training. The standard SAM approach, however, consists of two forward-backward steps in each training iteration, doubling the computational cost compared to the base optimizer (e.g., Adam). To mitigate this drawback, we introduce a novel algorithm, Fast Graph Sharpness-Aware Minimization (FGSAM), that integrates the rapid training of Multi-Layer Perceptrons (MLPs) with the superior performance of GNNs. Specifically, we utilize GNNs for parameter perturbation while employing MLPs to minimize the perturbed loss so that we can find a flat minimum with good generalization more efficiently. Moreover, our method reutilizes the gradient from the perturbation phase to incorporate graph topology into the minimization process at almost zero additional cost. To further enhance training efficiency, we develop FGSAM+ that executes exact perturbations periodically. Extensive experiments demonstrate that our proposed algorithm outperforms the standard SAM with lower computational costs in FSNC tasks. In particular, our FGSAM+, as a SAM variant, offers faster optimization than the base optimizer in most cases. In addition to FSNC, our proposed methods also demonstrate competitive performance in the standard node classification task for heterophilic graphs, highlighting their broad applicability. The code is available at https://github.com/draym28/FGSAM_NeurIPS24. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16845v1-abstract-full').style.display = 'none'; document.getElementById('2410.16845v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
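<p>For readers unfamiliar with SAM's two forward-backward steps mentioned above, here is a generic PyTorch sketch of one SAM update (background illustration, not FGSAM itself; FGSAM replaces the second, minimization step with a weight-sharing MLP and reuses the perturbation gradient to retain graph topology):</p> <pre><code class="language-python">
import torch

def sam_step(model, loss_fn, data, opt, rho=0.05):
    """One sharpness-aware minimization step (generic form)."""
    loss_fn(model(data)).backward()
    params = [p for p in model.parameters() if p.grad is not None]
    with torch.no_grad():
        norm = torch.norm(torch.stack([p.grad.norm() for p in params]))
        eps = [rho * p.grad / (norm + 1e-12) for p in params]
        for p, e in zip(params, eps):
            p.add_(e)                  # step 1: ascend to the local worst case
    opt.zero_grad()
    loss_fn(model(data)).backward()    # step 2: gradient at the perturbed point
    with torch.no_grad():
        for p, e in zip(params, eps):
            p.sub_(e)                  # restore the original weights
    opt.step()
    opt.zero_grad()
</code></pre>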
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS24; The first two authors contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15070">arXiv:2410.15070</a> <span> [<a href="https://arxiv.org/pdf/2410.15070">pdf</a>, <a href="https://arxiv.org/ps/2410.15070">ps</a>, <a href="https://arxiv.org/format/2410.15070">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> The dual codes of two families of BCH codes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haojie Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xia Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+W">Wei Lu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiwang Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15070v2-abstract-short" style="display: inline;"> In this paper, we present an infinite family of MDS codes over $\mathbb{F}_{2^s}$ and two infinite families of almost MDS codes over $\mathbb{F}_{p^s}$ for any prime $p$, by investigating the parameters of the dual codes of two families of BCH codes. Notably, these almost MDS codes include two infinite families of near MDS codes over $\mathbb{F}_{3^s}$, resolving a conjecture posed by Geng et al.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15070v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15070v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15070v2-abstract-full" style="display: none;"> In this paper, we present an infinite family of MDS codes over $\mathbb{F}_{2^s}$ and two infinite families of almost MDS codes over $\mathbb{F}_{p^s}$ for any prime $p$, by investigating the parameters of the dual codes of two families of BCH codes. Notably, these almost MDS codes include two infinite families of near MDS codes over $\mathbb{F}_{3^s}$, resolving a conjecture posed by Geng et al. in 2022. Furthermore, we demonstrate that both of these almost AMDS codes and their dual codes hold infinite families of $3$-designs over \(\mathbb{F}_{p^s}\) for any prime $p$. Additionally, we study the subfield subcodes of these families of MDS and near MDS codes, and provide several binary, ternary, and quaternary codes with best known parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15070v2-abstract-full').style.display = 'none'; document.getElementById('2410.15070v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14389">arXiv:2410.14389</a> <span> [<a href="https://arxiv.org/pdf/2410.14389">pdf</a>, <a href="https://arxiv.org/format/2410.14389">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SurgeryV2: Bridging the Gap Between Model Merging and Multi-Task Learning with Deep Representation Surgery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+E">Enneng Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenyi Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+G">Guibing Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xingwei Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaocun Cao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14389v1-abstract-short" style="display: inline;"> Model merging-based multitask learning (MTL) offers a promising approach for performing MTL by merging multiple expert models without requiring access to raw training data. However, in this paper, we examine the merged model's representation distribution and uncover a critical issue of "representation bias". This bias arises from a significant distribution gap between the representations of the me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14389v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14389v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14389v1-abstract-full" style="display: none;"> Model merging-based multitask learning (MTL) offers a promising approach for performing MTL by merging multiple expert models without requiring access to raw training data. However, in this paper, we examine the merged model's representation distribution and uncover a critical issue of "representation bias". This bias arises from a significant distribution gap between the representations of the merged and expert models, leading to the suboptimal performance of the merged MTL model. To address this challenge, we first propose a representation surgery solution called Surgery. Surgery is a lightweight, task-specific module that aligns the final layer representations of the merged model with those of the expert models, effectively alleviating bias and improving the merged model's performance. Despite these improvements, a performance gap remains compared to the traditional MTL method. 
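<p>A possible minimal form of such a representation-surgery module (an assumed sketch based on the description above, not the authors' exact design): a low-rank residual adapter trained without labels to pull merged-model representations toward the corresponding expert's.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class SurgeryAdapter(nn.Module):
    """Lightweight task-specific map applied to merged-model features."""
    def __init__(self, dim, rank=16):
        super().__init__()
        self.down = nn.Linear(dim, rank, bias=False)
        self.up = nn.Linear(rank, dim, bias=False)

    def forward(self, feats):
        return feats + self.up(self.down(feats))

def surgery_loss(adapter, merged_feats, expert_feats):
    # unsupervised alignment: shrink the distance between repaired merged
    # representations and the expert model's representations
    return (adapter(merged_feats) - expert_feats).abs().mean()
</code></pre>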
Further analysis reveals that representation bias phenomena exist at each layer of the merged model, and aligning representations only in the last layer is insufficient for fully reducing systemic bias because biases introduced at each layer can accumulate and interact in complex ways. To tackle this, we then propose a more comprehensive solution, deep representation surgery (also called SurgeryV2), which mitigates representation bias across all layers, and thus bridges the performance gap between model merging-based MTL and traditional MTL. Finally, we design an unsupervised optimization objective to optimize both the Surgery and SurgeryV2 modules. Our experimental results show that incorporating these modules into state-of-the-art (SOTA) model merging schemes leads to significant performance gains. Notably, our SurgeryV2 scheme reaches almost the same level as individual expert models or the traditional MTL model. The code is available at \url{https://github.com/EnnengYang/SurgeryV2}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14389v1-abstract-full').style.display = 'none'; document.getElementById('2410.14389v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is an extended version of our previous work [arXiv:2402.02705] presented at ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12138">arXiv:2410.12138</a> <span> [<a href="https://arxiv.org/pdf/2410.12138">pdf</a>, <a href="https://arxiv.org/format/2410.12138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Preference Optimization with Multi-Sample Comparisons </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chaoqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhuokai Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chen Zhu</a>, <a href="/search/cs?searchtype=author&query=Sankararaman%2C+K+A">Karthik Abinav Sankararaman</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xuefei Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaorun Chen</a>, <a href="/search/cs?searchtype=author&query=Khabsa%2C+M">Madian Khabsa</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+H">Hao Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sinong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12138v1-abstract-short" style="display: inline;"> Recent advancements in generative models, particularly large language models (LLMs) and 
diffusion models, have been driven by extensive pretraining on large datasets followed by post-training. However, current post-training methods such as reinforcement learning from human feedback (RLHF) and direct alignment from preference methods (DAP) primarily utilize single-sample comparisons. These approach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12138v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12138v1-abstract-full" style="display: none;"> Recent advancements in generative models, particularly large language models (LLMs) and diffusion models, have been driven by extensive pretraining on large datasets followed by post-training. However, current post-training methods such as reinforcement learning from human feedback (RLHF) and direct alignment from preference methods (DAP) primarily utilize single-sample comparisons. These approaches often fail to capture critical characteristics such as generative diversity and bias, which are more accurately assessed through multiple samples. To address these limitations, we introduce a novel approach that extends post-training to include multi-sample comparisons. To achieve this, we propose Multi-sample Direct Preference Optimization (mDPO) and Multi-sample Identity Preference Optimization (mIPO). These methods improve traditional DAP methods by focusing on group-wise characteristics. Empirically, we demonstrate that multi-sample comparison is more effective in optimizing collective characteristics (e.g., diversity and bias) for generative models than single-sample comparison. Additionally, our findings suggest that multi-sample comparisons provide a more robust optimization framework, particularly for datasets with label noise. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12138v1-abstract-full').style.display = 'none'; document.getElementById('2410.12138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
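<p>The move from single-sample to group-wise comparison can be sketched in a few lines (an assumed form of the idea; the exact mDPO/mIPO objectives are given in the paper): each side of a DPO-style loss aggregates the policy-vs-reference log-ratio over a group of m generations before the comparison.</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def mdpo_loss(logp_win, logp_lose, ref_win, ref_lose, beta=0.1):
    """DPO-style loss over GROUPS of samples.

    Each argument is a tensor of shape (m,): per-sample log-probabilities of
    the preferred / dispreferred groups under the policy and the reference.
    """
    group_win = (logp_win - ref_win).mean()      # group-level log-ratio
    group_lose = (logp_lose - ref_lose).mean()
    return -F.logsigmoid(beta * (group_win - group_lose))
</code></pre>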
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10365">arXiv:2410.10365</a> <span> [<a href="https://arxiv.org/pdf/2410.10365">pdf</a>, <a href="https://arxiv.org/format/2410.10365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SpeGCL: Self-supervised Graph Spectrum Contrastive Learning without Positive Samples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shou%2C+Y">Yuntao Shou</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiangyong Cao</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+D">Deyu Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10365v1-abstract-short" style="display: inline;"> Graph Contrastive Learning (GCL) excels at managing noise and fluctuations in input data, making it popular in various fields (e.g., social networks, and knowledge graphs). Our study finds that the difference in high-frequency information between augmented graphs is greater than that in low-frequency information. However, most existing GCL methods focus mainly on the time domain (low-frequency inf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10365v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10365v1-abstract-full" style="display: none;"> Graph Contrastive Learning (GCL) excels at managing noise and fluctuations in input data, making it popular in various fields (e.g., social networks, and knowledge graphs). Our study finds that the difference in high-frequency information between augmented graphs is greater than that in low-frequency information. However, most existing GCL methods focus mainly on the time domain (low-frequency information) for node feature representations and cannot make good use of high-frequency information to speed up model convergence. Furthermore, existing GCL paradigms optimize graph embedding representations by pulling the distance between positive sample pairs closer and pushing the distance between positive and negative sample pairs farther away, but our theoretical analysis shows that graph contrastive learning benefits from pushing negative pairs farther away rather than pulling positive pairs closer. To solve the above-mentioned problems, we propose a novel spectral GCL framework without positive samples, named SpeGCL. Specifically, to solve the problem that existing GCL methods cannot utilize high-frequency information, SpeGCL uses a Fourier transform to extract high-frequency and low-frequency information of node features, and constructs a contrastive learning mechanism in a Fourier space to obtain better node feature representation. Furthermore, SpeGCL relies entirely on negative samples to refine the graph embedding. 
We also provide a theoretical justification for the efficacy of using only negative samples in SpeGCL. Extensive experiments on unsupervised learning, transfer learning, and semi-supervised learning have validated the superiority of our SpeGCL framework over state-of-the-art GCL methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10365v1-abstract-full').style.display = 'none'; document.getElementById('2410.10365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08688">arXiv:2410.08688</a> <span> [<a href="https://arxiv.org/pdf/2410.08688">pdf</a>, <a href="https://arxiv.org/format/2410.08688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Chain-of-Restoration: Multi-Task Image Restoration Models are Zero-Shot Step-by-Step Universal Image Restorers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jin Cao</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+D">Deyu Meng</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiangyong Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08688v1-abstract-short" style="display: inline;"> Despite previous works typically targeting isolated degradation types, recent research has increasingly focused on addressing composite degradations which involve a complex interplay of multiple different isolated degradations. Recognizing the challenges posed by the exponential number of possible degradation combinations, we propose Universal Image Restoration (UIR), a new task setting that requi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08688v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08688v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08688v1-abstract-full" style="display: none;"> Despite previous works typically targeting isolated degradation types, recent research has increasingly focused on addressing composite degradations which involve a complex interplay of multiple different isolated degradations. Recognizing the challenges posed by the exponential number of possible degradation combinations, we propose Universal Image Restoration (UIR), a new task setting that requires models to be trained on a set of degradation bases and then remove any degradation that these bases can potentially compose in a zero-shot manner.
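<p>The step-by-step removal loop that the rest of this abstract describes can be sketched in a few lines of Python (names are illustrative, not the released interface): a discriminator names one remaining degradation basis per step, and the multi-task restorer removes it until nothing is left.</p> <pre><code class="language-python">
def chain_of_restoration(image, restorer, discriminator, max_steps=5):
    """Remove an unknown composite degradation one basis at a time."""
    for _ in range(max_steps):
        basis = discriminator(image)      # e.g. "rain", "haze", or "clean"
        if basis == "clean":
            break                         # fully restored
        image = restorer(image, task=basis)
    return image
</code></pre>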
Inspired by Chain-of-Thought prompting, which guides LLMs to address problems step by step, we propose Chain-of-Restoration (CoR), which instructs models to remove unknown composite degradations step by step. By integrating a simple Degradation Discriminator into pre-trained multi-task models, CoR facilitates the process where models remove one degradation basis per step, continuing this process until the image is fully restored from the unknown composite degradation. Extensive experiments show that CoR significantly improves model performance in removing composite degradations, achieving results comparable to or surpassing those of state-of-the-art (SoTA) methods trained on all degradations. The code will be released at https://github.com/toummHus/Chain-of-Restoration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08688v1-abstract-full').style.display = 'none'; document.getElementById('2410.08688v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07272">arXiv:2410.07272</a> <span> [<a href="https://arxiv.org/pdf/2410.07272">pdf</a>, <a href="https://arxiv.org/format/2410.07272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Boosting the Performance of Decentralized Federated Learning via Catalyst Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinglun Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yingqi Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Q">Quanjun Yin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07272v1-abstract-short" style="display: inline;"> Decentralized Federated Learning has emerged as an alternative to centralized architectures due to its faster training, privacy preservation, and reduced communication overhead. In decentralized communication, the server aggregation phase in Centralized Federated Learning shifts to the client side, which means that clients connect with each other in a peer-to-peer manner.
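<p>A minimal sketch of the peer-to-peer aggregation just described (generic decentralized averaging, not DFedCata itself; the full abstract below adds a Moreau-envelope proximal term and a Nesterov extrapolation step on top of this):</p> <pre><code class="language-python">
import numpy as np

def gossip_round(params, mixing):
    """One decentralized aggregation round.

    params: (num_clients, dim) stacked client models
    mixing: (num_clients, num_clients) row-stochastic matrix encoding who
            averages with whom in the peer-to-peer topology
    """
    return mixing @ params

def nesterov_extrapolate(x_new, x_old, gamma=0.9):
    # Nesterov-style extrapolation of successive aggregated models, the kind
    # of acceleration step attributed to DFedCata in the full abstract
    return x_new + gamma * (x_new - x_old)
</code></pre>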
However, compared to the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07272v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07272v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07272v1-abstract-full" style="display: none;"> Decentralized Federated Learning has emerged as an alternative to centralized architectures due to its faster training, privacy preservation, and reduced communication overhead. In decentralized communication, the server aggregation phase in Centralized Federated Learning shifts to the client side, which means that clients connect with each other in a peer-to-peer manner. However, compared to the centralized mode, data heterogeneity in Decentralized Federated Learning will cause larger variances between aggregated models, which leads to slow convergence in training and poor generalization performance in tests. To address these issues, we introduce Catalyst Acceleration and propose an accelerated Decentralized Federated Learning algorithm called DFedCata. It consists of two main components: the Moreau envelope function, which primarily addresses parameter inconsistencies among clients caused by data heterogeneity, and Nesterov's extrapolation step, which accelerates the aggregation phase. Theoretically, we prove the optimization error bound and generalization error bound of the algorithm, providing a further understanding of the nature of the algorithm and theoretical perspectives on the hyperparameter choice. Empirically, we demonstrate the advantages of the proposed algorithm in both convergence speed and generalization performance on CIFAR10/100 with various non-iid data distributions. Furthermore, we also experimentally verify the theoretical properties of DFedCata. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07272v1-abstract-full').style.display = 'none'; document.getElementById('2410.07272v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2410.06482</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07051">arXiv:2410.07051</a> <span> [<a href="https://arxiv.org/pdf/2410.07051">pdf</a>, <a href="https://arxiv.org/format/2410.07051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Exponents for Shared Randomness-Assisted Channel Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oufkir%2C+A">Aadil Oufkir</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+M+X">Michael X.
Cao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hao-Chung Cheng</a>, <a href="/search/cs?searchtype=author&query=Berta%2C+M">Mario Berta</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07051v1-abstract-short" style="display: inline;"> We determine the exact error and strong converse exponents of shared randomness-assisted channel simulation in worst-case total-variation distance. Namely, we find that these exponents can be written as simple optimizations over the Rényi channel mutual information. Strikingly, and in stark contrast to channel coding, there are no critical rates, allowing a tight characterization for arbitrary rat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07051v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07051v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07051v1-abstract-full" style="display: none;"> We determine the exact error and strong converse exponents of shared randomness-assisted channel simulation in worst-case total-variation distance. Namely, we find that these exponents can be written as simple optimizations over the Rényi channel mutual information. Strikingly, and in stark contrast to channel coding, there are no critical rates, allowing a tight characterization for arbitrary rates below and above the simulation capacity. We derive our results by asymptotically expanding the meta-converse for channel simulation [Cao {\it et al.}, IEEE Trans.~Inf.~Theory (2024)], which corresponds to non-signaling assisted codes. We prove this to be asymptotically tight by employing the approximation algorithms from [Berta {\it et al.}, Proc.~IEEE ISIT (2024)], which show how to round any non-signaling assisted strategy to a strategy that only uses shared randomness. Notably, this implies that any additional quantum entanglement-assistance does not change the error or the strong converse exponents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07051v1-abstract-full').style.display = 'none'; document.getElementById('2410.07051v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27+6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06719">arXiv:2410.06719</a> <span> [<a href="https://arxiv.org/pdf/2410.06719">pdf</a>, <a href="https://arxiv.org/format/2410.06719">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Suppress Content Shift: Better Diffusion Features via Off-the-Shelf Generation Techniques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+B">Benyuan Meng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qianqian Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zitai Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qingming Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06719v3-abstract-short" style="display: inline;"> Diffusion models are powerful generative models, and this capability can also be applied to discrimination. The inner activations of a pre-trained diffusion model can serve as features for discriminative tasks, namely, diffusion feature. We discover that diffusion feature has been hindered by a hidden yet universal phenomenon that we call content shift. To be specific, there are content difference… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06719v3-abstract-full').style.display = 'inline'; document.getElementById('2410.06719v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06719v3-abstract-full" style="display: none;"> Diffusion models are powerful generative models, and this capability can also be applied to discrimination. The inner activations of a pre-trained diffusion model can serve as features for discriminative tasks, namely, diffusion feature. We discover that diffusion feature has been hindered by a hidden yet universal phenomenon that we call content shift. To be specific, there are content differences between features and the input image, such as the exact shape of a certain object. We locate the cause of content shift as one inherent characteristic of diffusion models, which suggests the broad existence of this phenomenon in diffusion feature. Further empirical study also indicates that its negative impact is not negligible even when content shift is not visually perceivable. Hence, we propose to suppress content shift to enhance the overall quality of diffusion features. Specifically, content shift is related to the information drift during the process of recovering an image from the noisy input, pointing out the possibility of turning off-the-shelf generation techniques into tools for content shift suppression. 
We further propose a practical guideline named GATE to efficiently evaluate the potential benefit of a technique and provide an implementation of our methodology. Despite the simplicity, the proposed approach has achieved superior results on various tasks and datasets, validating its potential as a generic booster for diffusion features. Our code is available at https://github.com/Darkbblue/diffusion-content-shift. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06719v3-abstract-full').style.display = 'none'; document.getElementById('2410.06719v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2410.03558</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04313">arXiv:2410.04313</a> <span> [<a href="https://arxiv.org/pdf/2410.04313">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Vehicle-in-Virtual-Environment Method for ADAS and Connected and Automated Driving Function Development/Demonstration/Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xincheng Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haochong Chen</a>, <a href="/search/cs?searchtype=author&query=Aksun-Guvenc%2C+B">Bilin Aksun-Guvenc</a>, <a href="/search/cs?searchtype=author&query=Guvenc%2C+L">Levent Guvenc</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04313v1-abstract-short" style="display: inline;"> The current approach for new Advanced Driver Assistance System (ADAS) and Connected and Automated Driving (CAD) function development involves a significant amount of public road testing, which is inefficient due to the number of miles that need to be driven for rare and extreme events to take place, making it very costly, and unsafe as the rest of the road users become involuntary test subjec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04313v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04313v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04313v1-abstract-full" style="display: none;"> The current approach for new Advanced Driver Assistance System (ADAS) and Connected and Automated Driving (CAD) function development involves a significant amount of public road testing, which is inefficient due to the number of miles that need to be driven for rare and extreme events to take place, making it very costly, and unsafe as the rest of the road users become involuntary test subjects. A new method for safe, efficient, and repeatable development, demonstration, and evaluation of ADAS and CAD functions, called Vehicle-in-Virtual-Environment (VVE), was recently introduced as a solution to this problem. The vehicle is operated in a large, empty, and flat area during VVE while its localization and perception sensor data is fed from the virtual environment, with other traffic and rare and extreme events being generated as needed. The virtual environment can be easily configured and modified to construct different testing scenarios on demand. This paper focuses on the VVE approach and introduces the coordinate transformations needed to sync pose (location and orientation) in the virtual and physical worlds and the handling of localization and perception sensor data, using the highly realistic 3D simulation model of a recent autonomous shuttle deployment site in Columbus, Ohio as the virtual world. As a further example that uses multiple actors, the use of VVE for Vehicle-to-VRU communication-based Vulnerable Road User (VRU) safety is presented in the paper using VVE experiments and real pedestrian(s) in a safe and repeatable manner. VVE experiments are used to demonstrate the efficacy of the method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04313v1-abstract-full').style.display = 'none'; document.getElementById('2410.04313v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04260">arXiv:2410.04260</a> <span> [<a href="https://arxiv.org/pdf/2410.04260">pdf</a>, <a href="https://arxiv.org/format/2410.04260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Pareto Control Barrier Function for Inner Safe Set Maximization Under Input Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaoyang Cao</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zhe Fu</a>, <a href="/search/cs?searchtype=author&query=Bayen%2C+A+M">Alexandre M. Bayen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04260v1-abstract-short" style="display: inline;"> This article introduces the Pareto Control Barrier Function (PCBF) algorithm to maximize the inner safe set of dynamical systems under input constraints.
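<p>As background for the fuller description that follows, a generic control-barrier-function safety filter looks like this (a standard min-norm construction, not the PCBF algorithm itself; PCBF additionally trades safety against safe-set volume with Pareto multi-task learning): for dynamics $\dot{x} = f(x) + g(x)u$ and barrier $h$, keep $\dot{h} + \alpha h \ge 0$ while staying close to the nominal control.</p> <pre><code class="language-python">
import numpy as np

def cbf_filter(u_nom, h, Lf_h, Lg_h, alpha=1.0):
    """Min-norm CBF filter for a single affine constraint.

    Enforces Lf_h + Lg_h @ u + alpha * h >= 0, where h is the barrier value
    and Lf_h, Lg_h are the Lie derivatives of h along f and g.
    """
    residual = Lf_h + Lg_h @ u_nom + alpha * h
    if residual >= 0:
        return u_nom                       # nominal control already safe
    # closed-form projection onto the constraint boundary
    return u_nom - residual * Lg_h / (Lg_h @ Lg_h + 1e-12)
</code></pre>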
Traditional Control Barrier Functions (CBFs) ensure safety by maintaining system trajectories within a safe set but often fail to account for realistic input constraints. To address this problem, we leverage the Pareto multi-task learning framewo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04260v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04260v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04260v1-abstract-full" style="display: none;"> This article introduces the Pareto Control Barrier Function (PCBF) algorithm to maximize the inner safe set of dynamical systems under input constraints. Traditional Control Barrier Functions (CBFs) ensure safety by maintaining system trajectories within a safe set but often fail to account for realistic input constraints. To address this problem, we leverage the Pareto multi-task learning framework to balance competing objectives of safety and safe set volume. The PCBF algorithm is applicable to high-dimensional systems and is computationally efficient. We validate its effectiveness through comparison with Hamilton-Jacobi reachability for an inverted pendulum and through simulations on a 12-dimensional quadrotor system. Results show that the PCBF consistently outperforms existing methods, yielding larger safe sets and ensuring safety under input constraints. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04260v1-abstract-full').style.display = 'none'; document.getElementById('2410.04260v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ACC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03558">arXiv:2410.03558</a> <span> [<a href="https://arxiv.org/pdf/2410.03558">pdf</a>, <a href="https://arxiv.org/format/2410.03558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Not All Diffusion Model Activations Have Been Evaluated as Discriminative Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+B">Benyuan Meng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qianqian Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zitai Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qingming Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03558v3-abstract-short" style="display: inline;"> Diffusion models are initially designed for image generation. 
arXiv:2410.03558 [pdf, other] cs.CV cs.AI
Not All Diffusion Model Activations Have Been Evaluated as Discriminative Features
Authors: Benyuan Meng, Qianqian Xu, Zitai Wang, Xiaochun Cao, Qingming Huang
Abstract: Diffusion models are initially designed for image generation. Recent research shows that the internal signals within their backbones, named activations, can also serve as dense features for various discriminative tasks such as semantic segmentation. Given numerous activations, selecting a small yet effective subset poses a fundamental problem. To this end, early work in this field performed a large-scale quantitative comparison of the discriminative ability of the activations. However, we find that many potential activations have not been evaluated, such as the queries and keys used to compute attention scores. Moreover, recent advancements in diffusion architectures bring many new activations, such as those within embedded ViT modules. Taken together, activation selection remains an unresolved yet overlooked problem. To tackle this issue, this paper takes a further step by evaluating a much broader range of activations. Considering the significant increase in activations, a full-scale quantitative comparison is no longer feasible. Instead, we seek to understand the properties of these activations, such that clearly inferior activations can be filtered out in advance via simple qualitative evaluation. After careful analysis, we discover three properties universal among diffusion models, enabling this study to go beyond specific models. On top of this, we present effective feature selection solutions for several popular diffusion models. Finally, experiments across multiple discriminative tasks validate the superiority of our method over SOTA competitors. Our code is available at https://github.com/Darkbblue/generic-diffusion-feature.
Submitted 18 October, 2024; v1 submitted 4 October, 2024; originally announced October 2024.
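Harvesting candidate activations of this kind is typically done with forward hooks. A generic PyTorch sketch with a toy convolutional stand-in for a diffusion backbone (the paper's actual selection criteria are not reproduced here):

```python
import torch
import torch.nn as nn

# Toy stand-in for a diffusion backbone; a real study would hook a
# pretrained U-Net or DiT instead.
backbone = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1), nn.SiLU(),
    nn.Conv2d(16, 32, 3, padding=1), nn.SiLU(),
    nn.Conv2d(32, 3, 3, padding=1),
)

activations = {}

def save_activation(name):
    def hook(module, inputs, output):
        activations[name] = output.detach()
    return hook

# Register a hook on every candidate layer whose output we might
# evaluate as a dense feature.
for name, module in backbone.named_modules():
    if isinstance(module, nn.Conv2d):
        module.register_forward_hook(save_activation(name))

with torch.no_grad():
    backbone(torch.randn(1, 3, 64, 64))  # one denoising-style forward pass

for name, feat in activations.items():
    print(name, tuple(feat.shape))  # candidate features for downstream heads
```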
arXiv:2410.01768 [pdf, other] cs.CV
SegEarth-OV: Towards Training-Free Open-Vocabulary Segmentation for Remote Sensing Images
Authors: Kaiyu Li, Ruixun Liu, Xiangyong Cao, Xueru Bai, Feng Zhou, Deyu Meng, Zhi Wang
Abstract: Remote sensing imagery plays an irreplaceable role in fields such as agriculture, water resources, military, and disaster relief. Pixel-level interpretation is a critical aspect of remote sensing image applications; however, a prevalent limitation remains the need for extensive manual annotation. To this end, we introduce open-vocabulary semantic segmentation (OVSS) into the remote sensing context. However, because remote sensing images are sensitive to low-resolution features, the predicted masks exhibit distorted target shapes and ill-fitting boundaries. To tackle this issue, we propose a simple and general upsampler, SimFeatUp, to restore lost spatial information in deep features in a training-free manner. Further, based on the observed abnormal response of local patch tokens to the [CLS] token in CLIP, we propose a straightforward subtraction operation to alleviate the global bias in patch tokens. Extensive experiments are conducted on 17 remote sensing datasets spanning semantic segmentation, building extraction, road detection, and flood detection tasks. Our method achieves average improvements of 5.8%, 8.2%, 4.0%, and 15.3% over state-of-the-art methods on the four tasks. All code is released at
https://earth-insights.github.io/SegEarth-OV.
Submitted 4 November, 2024; v1 submitted 2 October, 2024; originally announced October 2024.
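The "straightforward subtraction" could plausibly look like the sketch below; the tensor layout and the scaling factor lam are assumptions for illustration, not SegEarth-OV's exact recipe:

```python
import torch

def debias_patch_tokens(tokens: torch.Tensor, lam: float = 0.3) -> torch.Tensor:
    """tokens: (B, 1 + N, D) CLIP outputs with the [CLS] token first.
    Subtract a scaled copy of the global [CLS] embedding from every
    patch token to suppress the global bias (layout and lam are
    illustrative assumptions, not the paper's exact operation)."""
    cls_tok, patches = tokens[:, :1, :], tokens[:, 1:, :]
    return patches - lam * cls_tok  # broadcast over the N patch positions

out = debias_patch_tokens(torch.randn(2, 197, 512))
print(out.shape)  # (2, 196, 512) debiased dense features
```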
arXiv:2410.01226 [pdf, other] cs.CV
Towards Native Generative Model for 3D Head Avatar
Authors: Yiyu Zhuang, Yuxiao He, Jiawei Zhang, Yanwen Wang, Jiahe Zhu, Yao Yao, Siyu Zhu, Xun Cao, Hao Zhu
Abstract: Creating 3D head avatars is a significant yet challenging task for many application scenarios. Previous studies have set out to learn 3D human head generative models from massive 2D image data. Although these models generalize well for human appearance, the resulting models are not 360°-renderable and the predicted 3D geometry is unreliable. Therefore, such results cannot be used in VR, game modeling, and other scenarios that require 360°-renderable 3D head models. An intuitive idea is that 3D head models of limited quantity but high 3D accuracy are more reliable training data for a high-quality 3D generative model. In this vein, we delve into how to learn a native generative model for 360° full heads from a limited 3D head dataset. Specifically, three major problems are studied: 1) how to effectively utilize various representations for generating the 360°-renderable human head; 2) how to disentangle the appearance, shape, and motion of human faces to generate a 3D head model that can be edited by appearance and driven by motion; 3) how to extend the generalization capability of the generative model to support downstream tasks. Comprehensive experiments are conducted to verify the effectiveness of the proposed model. We hope the proposed models and artist-designed dataset can inspire future research on learning native generative 3D head models from limited 3D datasets.
Submitted 2 October, 2024; originally announced October 2024.

arXiv:2409.19679 [pdf, other] cs.CV
SemiDDM-Weather: A Semi-supervised Learning Framework for All-in-one Adverse Weather Removal
Authors: Fang Long, Wenkang Su, Zixuan Li, Lei Cai, Mingjie Li, Yuan-Gen Wang, Xiaochun Cao
Abstract: Adverse weather removal aims to restore clear vision under adverse weather conditions.
Existing methods are mostly tailored to specific weather types and rely heavily on extensive labeled data. To address these two limitations, this paper presents a pioneering semi-supervised all-in-one adverse weather removal framework built on a teacher-student network with a Denoising Diffusion Model (DDM) as the backbone, termed SemiDDM-Weather. For the DDM backbone of SemiDDM-Weather, we adopt the state-of-the-art wavelet diffusion model WaveDiff with customized inputs and loss functions, devoted to facilitating the learning of many-to-one mapping distributions for efficient all-in-one adverse weather removal with limited labeled data. To mitigate the risk of misleading model training due to potentially inaccurate pseudo-labels generated by the teacher network in semi-supervised learning, we introduce quality assessment and content consistency constraints to screen the "optimal" outputs from the teacher network as pseudo-labels, thus more effectively guiding the student network's training with unlabeled data. Experimental results show that on both synthetic and real-world datasets, SemiDDM-Weather consistently delivers high visual quality and superior adverse weather removal, even compared to fully supervised competitors. Our code and pre-trained model are available at this repository.
Submitted 29 September, 2024; originally announced September 2024.
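The pseudo-label screening idea can be pictured as a filter over teacher outputs; quality_fn and consistency_fn below are placeholder stand-ins for whatever metrics the framework actually uses, and the thresholds are invented:

```python
import torch

def screen_pseudo_labels(inputs, teacher_outputs,
                         quality_fn, consistency_fn,
                         q_min=0.2, c_min=0.8):
    """Keep a teacher restoration as a pseudo-label only when it passes
    both a no-reference quality check and a content-consistency check
    against the degraded input (a sketch of the screening idea, not the
    paper's exact constraints)."""
    kept = []
    for x, y_hat in zip(inputs, teacher_outputs):
        if quality_fn(y_hat) >= q_min and consistency_fn(x, y_hat) >= c_min:
            kept.append((x, y_hat))
    return kept

# Toy stand-ins: a variance-based "quality" score and cosine similarity
# of flattened images as "content consistency".
quality = lambda y: float(y.std())
consistency = lambda x, y: float(torch.cosine_similarity(
    x.flatten(), y.flatten(), dim=0))

batch = [torch.rand(3, 64, 64) for _ in range(4)]
restored = [img + 0.05 * torch.randn_like(img) for img in batch]
print(len(screen_pseudo_labels(batch, restored, quality, consistency)))
```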
arXiv:2409.19526 [pdf, other] cs.CR cs.AI cs.CV cs.LG
Efficient Backdoor Defense in Multimodal Contrastive Learning: A Token-Level Unlearning Method for Mitigating Threats
Authors: Kuanrong Liu, Siyuan Liang, Jiawei Liang, Pengwen Dai, Xiaochun Cao
Abstract: Multimodal contrastive learning uses various data modalities to create high-quality features, but its reliance on extensive data sources from the Internet makes it vulnerable to backdoor attacks. These attacks insert malicious behaviors during training, which are activated by specific triggers during inference, posing significant security risks. Although existing fine-tuning countermeasures reduce the malicious impact of such attacks, these defenses frequently necessitate extensive training time and degrade clean accuracy. In this study, we propose an efficient defense mechanism against backdoor threats based on machine unlearning: we strategically create a small set of poisoned samples to help the model rapidly unlearn backdoor vulnerabilities, a method we call Unlearn Backdoor Threats (UBT). We use overfit training to strengthen backdoor shortcuts and accurately detect suspicious samples in the potential poisoning data set. Then, we select a small number of these suspicious samples for rapid forgetting, eliminating the backdoor effect and thus improving backdoor defense efficiency.
In the backdoor unlearning process, we present a novel token-based partial unlearning training regime. This technique focuses on the model's compromised elements, dissociating backdoor correlations while maintaining the model's overall integrity. Extensive experimental results show that our method effectively defends against various backdoor attack methods on the CLIP model. Compared to SOTA backdoor defense methods, UBT achieves the lowest attack success rate while maintaining high clean accuracy (the attack success rate decreases by 19% compared to SOTA, while clean accuracy increases by 2.57%).
Submitted 28 September, 2024; originally announced September 2024.
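A generic machine-unlearning update of the kind UBT builds on descends the loss on clean data while ascending it on suspected poisoned samples. The sketch below shows that recipe on a toy classifier; it is not UBT's token-level variant:

```python
import torch
import torch.nn as nn

def unlearn_step(model, clean_batch, poison_batch, optimizer,
                 loss_fn=nn.CrossEntropyLoss(), beta=1.0):
    """One unlearning update: descend the loss on clean data while
    ascending it on suspected backdoor samples (a generic recipe,
    not the paper's token-level procedure)."""
    xc, yc = clean_batch
    xp, yp = poison_batch
    optimizer.zero_grad()
    loss = loss_fn(model(xc), yc) - beta * loss_fn(model(xp), yp)
    loss.backward()
    optimizer.step()
    return loss.item()

model = nn.Linear(16, 4)
opt = torch.optim.SGD(model.parameters(), lr=0.05)
clean = (torch.randn(8, 16), torch.randint(0, 4, (8,)))
poison = (torch.randn(8, 16), torch.randint(0, 4, (8,)))
print(unlearn_step(model, clean, poison, opt))
```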
arXiv:2409.19042 [pdf, other] eess.AS cs.SD
Probing mental health information in speech foundation models
Authors: Marc de Gennes, Adrien Lesage, Martin Denais, Xuan-Nga Cao, Simon Chang, Pierre Van Remoortere, Cyrille Dakhlia, Rachid Riad
Abstract: Non-invasive methods for diagnosing mental health conditions, such as speech analysis, offer promising potential in modern medicine. Recent advancements in machine learning, particularly speech foundation models, have shown significant promise in detecting mental health states by capturing diverse features. This study investigates which pretext tasks in these models transfer best to mental health detection and examines how different model layers encode features relevant to mental health conditions. We also probe the optimal length of audio segments and the best pooling strategies to improve detection accuracy. Using the Callyope-GP and Androids datasets, we evaluate the models' effectiveness across different languages and speech tasks, aiming to enhance the generalizability of speech-based mental health diagnostics. Our approach achieves SOTA scores in depression detection on the Androids dataset.
Submitted 27 September, 2024; originally announced September 2024.
Comments: 6 pages, 4 figures
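Layer-wise probing of this sort reduces to pooling per-layer features and fitting a simple classifier per layer. A sketch with random arrays standing in for foundation-model features (dataset names and the extraction step are omitted):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)

# Stand-in for per-layer features of a speech foundation model:
# each layer gives (n_utterances, n_frames, dim); labels are binary states.
n, frames, dim, layers = 120, 50, 32, 6
features = [rng.normal(size=(n, frames, dim)) for _ in range(layers)]
labels = rng.integers(0, 2, size=n)

def probe_layer(feats, y, pooling="mean"):
    """Mean- or max-pool over frames, then score a linear probe by CV."""
    x = feats.mean(axis=1) if pooling == "mean" else feats.max(axis=1)
    clf = LogisticRegression(max_iter=1000)
    return cross_val_score(clf, x, y, cv=5).mean()

scores = [probe_layer(f, labels) for f in features]
print("best layer:", int(np.argmax(scores)), "acc:", max(scores))
```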
arXiv:2409.17681 [pdf, other] cs.NI cs.CY
Computation Pre-Offloading for MEC-Enabled Vehicular Networks via Trajectory Prediction
Authors: Ting Zhang, Bo Yang, Zhiwen Yu, Xuelin Cao, George C. Alexandropoulos, Yan Zhang, Chau Yuen
Abstract: Task offloading is of paramount importance for efficiently orchestrating vehicular wireless networks, and it requires information about the current network status and computational resources. However, due to vehicle mobility and the limited computational resources available for performing task offloading in near-real-time, such schemes may incur high latency and thus become infeasible. To address this issue, this paper presents a Trajectory Prediction-based Pre-offloading Decision (TPPD) algorithm that analyzes vehicles' historical trajectories to predict their future coordinates, thereby allowing computational resources to be allocated in advance. We first utilize a Long Short-Term Memory (LSTM) network to predict each vehicle's movement trajectory. Then, based on the task requirements and the predicted trajectories, we devise a dynamic resource allocation algorithm using a Double Deep Q-Network (DDQN) that enables the edge server to minimize task processing delay while ensuring effective utilization of the available computational resources. Simulation results verify the effectiveness of the proposed approach: compared with traditional real-time task offloading strategies, the TPPD algorithm significantly reduces task processing delay while improving resource utilization.
Submitted 26 September, 2024; originally announced September 2024.
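The prediction stage is concrete enough to sketch: an LSTM that maps a window of past (x, y) positions to the next position. Sizes are illustrative, and the DDQN resource allocator that consumes these predictions is not shown:

```python
import torch
import torch.nn as nn

class TrajectoryLSTM(nn.Module):
    """Predict a vehicle's next (x, y) from a window of past positions;
    a minimal stand-in for TPPD's prediction stage."""
    def __init__(self, hidden=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size=2, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, 2)

    def forward(self, xy_history):          # (B, T, 2)
        out, _ = self.lstm(xy_history)
        return self.head(out[:, -1])        # (B, 2) predicted next coordinates

model = TrajectoryLSTM()
history = torch.cumsum(torch.randn(4, 20, 2) * 0.1, dim=1)  # smooth toy tracks
print(model(history).shape)  # torch.Size([4, 2])
```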
arXiv:2409.17634 [pdf, other] cs.CV cs.AI
P4Q: Learning to Prompt for Quantization in Visual-language Models
Authors: Huixin Sun, Runqi Wang, Yanjing Li, Xianbin Cao, Xiaolong Jiang, Yao Hu, Baochang Zhang
Abstract: Large-scale pre-trained Vision-Language Models (VLMs) have gained prominence in various visual and multimodal tasks, yet deploying VLMs on downstream application platforms remains challenging due to their prohibitive requirements for training samples and computing resources. Fine-tuning and quantization of VLMs can substantially reduce these sample and computation costs, and both are urgently needed. There are two prevailing quantization paradigms: Quantization-Aware Training (QAT) can effectively quantize large-scale VLMs but incurs a huge training cost, while low-bit Post-Training Quantization (PTQ) suffers from a notable performance drop. We propose a method that balances fine-tuning and quantization, named "Prompt for Quantization" (P4Q), in which we design a lightweight architecture that leverages contrastive-loss supervision to enhance the recognition performance of a PTQ model. Our method effectively reduces the gap between image features and text features caused by low-bit quantization, using learnable prompts to reorganize textual representations and a low-bit adapter to realign the distributions of image and text features. We also introduce a distillation loss based on cosine-similarity predictions to distill the quantized model using a full-precision teacher.
Extensive experimental results demonstrate that P4Q outperforms prior art, even achieving results comparable to its full-precision counterparts. For instance, our 8-bit P4Q can theoretically compress CLIP-ViT/B-32 by 4x while achieving 66.94% Top-1 accuracy on ImageNet, outperforming the learnable-prompt fine-tuned full-precision model by 2.24% with negligible additional parameters.
Submitted 26 September, 2024; originally announced September 2024.
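A distillation loss "based on cosine-similarity predictions" plausibly matches the student's and teacher's image-text similarity distributions; the shapes and temperature below are assumptions, not P4Q's exact configuration:

```python
import torch
import torch.nn.functional as F

def cosine_sim_distill_loss(img_s, txt_s, img_t, txt_t, tau=0.07):
    """Distill a quantized (student) CLIP from a full-precision teacher by
    matching their image-text cosine-similarity distributions."""
    sim_s = F.normalize(img_s, dim=-1) @ F.normalize(txt_s, dim=-1).T
    sim_t = F.normalize(img_t, dim=-1) @ F.normalize(txt_t, dim=-1).T
    return F.kl_div(F.log_softmax(sim_s / tau, dim=-1),
                    F.softmax(sim_t / tau, dim=-1),
                    reduction="batchmean")

B, D = 8, 512  # toy batch of paired image/text embeddings
loss = cosine_sim_distill_loss(torch.randn(B, D), torch.randn(B, D),
                               torch.randn(B, D), torch.randn(B, D))
print(loss.item())
```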
arXiv:2409.17601 [pdf, other] cs.CV cs.AI
CleanerCLIP: Fine-grained Counterfactual Semantic Augmentation for Backdoor Defense in Contrastive Learning
Authors: Yuan Xun, Siyuan Liang, Xiaojun Jia, Xinwei Liu, Xiaochun Cao
Abstract: Pre-trained large models for multimodal contrastive learning, such as CLIP, are widely recognized as highly susceptible to data-poisoned backdoor attacks, which pose significant risks to downstream model training. In response to such potential threats, fine-tuning offers a simpler and more efficient defense choice than retraining large models with augmented data. In the supervised learning domain, fine-tuning defense strategies achieve excellent defense performance. However, in the unsupervised and semi-supervised domains, we find that when CLIP faces certain complex attack techniques, the existing fine-tuning defense strategy, CleanCLIP, has limited defense performance: the synonym substitution in its text augmentation is insufficient to enhance the text feature space. To compensate for this weakness, we improve it by proposing a fine-grained Text Alignment Cleaner (TA-Cleaner) that cuts off the feature connections of backdoor triggers. We randomly select a few samples for positive and negative subtext generation at each epoch of CleanCLIP, and align the subtexts to the images to strengthen text self-supervision. We evaluate the effectiveness of TA-Cleaner against six attack algorithms and conduct comprehensive zero-shot classification tests on ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves state-of-the-art defensiveness among fine-tuning-based defense techniques. Even against the novel attack technique BadCLIP, TA-Cleaner outperforms CleanCLIP by reducing the Top-1 and Top-10 attack success rates by 52.02% and 63.88%, respectively.
Submitted 14 November, 2024; v1 submitted 26 September, 2024; originally announced September 2024.

arXiv:2409.17058 [pdf, other] cs.CV
Degradation-Guided One-Step Image Super-Resolution with Diffusion Priors
Authors: Aiping Zhang, Zongsheng Yue, Renjing Pei, Wenqi Ren, Xiaochun Cao
Abstract: Diffusion-based image super-resolution (SR) methods have achieved remarkable success by leveraging large pre-trained text-to-image diffusion models as priors. However, these methods still face two challenges: the requirement for dozens of sampling steps to achieve satisfactory results, which limits efficiency in real scenarios, and the neglect of degradation models, which are critical auxiliary information for solving the SR problem. In this work, we introduce a novel one-step SR model, which significantly addresses the efficiency issue of diffusion-based SR methods. Unlike existing fine-tuning strategies, we design a degradation-guided Low-Rank Adaptation (LoRA) module specifically for SR, which corrects the model parameters based on degradation information pre-estimated from low-resolution images. This module not only facilitates a powerful data-dependent or degradation-dependent SR model but also preserves the generative prior of the pre-trained diffusion model as much as possible. Furthermore, we tailor a novel training pipeline by introducing an online negative-sample generation strategy. Combined with a classifier-free guidance strategy during inference, it largely improves the perceptual quality of the super-resolution results. Extensive experiments demonstrate the superior efficiency and effectiveness of the proposed model compared to recent state-of-the-art methods.
Submitted 25 September, 2024; originally announced September 2024.
Comments: The code is available at https://github.com/ArcticHare105/S3Diff
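One way to picture a degradation-guided LoRA is a low-rank update whose strength is predicted from a degradation embedding. The gating mechanism below is an assumption for illustration; the paper's actual module likely differs:

```python
import torch
import torch.nn as nn

class DegradationGuidedLoRA(nn.Module):
    """Linear layer with a low-rank update whose scale is predicted from a
    degradation embedding (a sketch of the idea, not the paper's module)."""
    def __init__(self, dim, rank=8, deg_dim=16):
        super().__init__()
        self.base = nn.Linear(dim, dim)
        self.base.weight.requires_grad_(False)   # frozen pretrained weight
        self.down = nn.Linear(dim, rank, bias=False)
        self.up = nn.Linear(rank, dim, bias=False)
        self.gate = nn.Linear(deg_dim, 1)        # degradation -> LoRA scale

    def forward(self, x, deg_emb):
        scale = torch.sigmoid(self.gate(deg_emb))          # (B, 1)
        return self.base(x) + scale * self.up(self.down(x))

layer = DegradationGuidedLoRA(dim=64)
x, deg = torch.randn(4, 64), torch.randn(4, 16)
print(layer(x, deg).shape)  # torch.Size([4, 64])
```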
arXiv:2409.15968 [pdf, other] cs.CV
Adversarial Backdoor Defense in CLIP
Authors: Junhao Kuang, Siyuan Liang, Jiawei Liang, Kuanrong Liu, Xiaochun Cao
Abstract: Multimodal contrastive pretraining, exemplified by models like CLIP, has been found to be vulnerable to backdoor attacks. While current backdoor defense methods primarily employ conventional data augmentation to create augmented samples aimed at feature alignment, these methods fail to capture the distinct features of backdoor samples, resulting in suboptimal defense performance. Observations reveal that adversarial examples and backdoor samples exhibit similarities in the feature space of compromised models. Building on this insight, we propose Adversarial Backdoor Defense (ABD), a novel data augmentation strategy that aligns features with meticulously crafted adversarial examples, effectively disrupting the backdoor association. Our experiments demonstrate that ABD provides robust defense against both traditional uni-modal and multimodal backdoor attacks targeting CLIP. Compared to the current state-of-the-art defense method, CleanCLIP, ABD reduces the attack success rate by 8.66% for BadNet, 10.52% for Blended, and 53.64% for BadCLIP, while maintaining a minimal average decrease of just 1.73% in clean accuracy.
Submitted 24 September, 2024; originally announced September 2024.
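The adversarial examples that ABD aligns features against can be generated with standard PGD; the sketch below shows that generator on a toy classifier (the feature-alignment training itself is not shown, and the attack budget is illustrative):

```python
import torch
import torch.nn as nn

def pgd_examples(model, x, y, eps=8/255, alpha=2/255, steps=10):
    """Standard PGD attack, used here only as the adversarial-example
    generator that a defense like ABD could align features against."""
    x_adv = x + torch.empty_like(x).uniform_(-eps, eps)
    loss_fn = nn.CrossEntropyLoss()
    for _ in range(steps):
        x_adv = x_adv.detach().requires_grad_(True)
        loss = loss_fn(model(x_adv), y)
        grad = torch.autograd.grad(loss, x_adv)[0]
        x_adv = x_adv + alpha * grad.sign()
        x_adv = x + (x_adv - x).clamp(-eps, eps)   # project back to eps-ball
        x_adv = x_adv.clamp(0, 1)                  # stay a valid image
    return x_adv.detach()

model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))
x = torch.rand(4, 3, 32, 32)
y = torch.randint(0, 10, (4,))
adv = pgd_examples(model, x, y)
print((adv - x).abs().max().item())  # bounded by eps
```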
arXiv:2409.15690 [pdf, other] cs.CL cs.IR cs.SI
A Survey of Stance Detection on Social Media: New Directions and Perspectives
Authors: Bowen Zhang, Genan Dai, Fuqiang Niu, Nan Yin, Xiaomao Fan, Senzhang Wang, Xiaochun Cao, Hu Huang
Abstract: In modern digital environments, users frequently express opinions on contentious topics, providing a wealth of information on prevailing attitudes. The systematic analysis of these opinions offers valuable insights for decision-making in various sectors, including marketing and politics. As a result, stance detection has emerged as a crucial subfield within affective computing, enabling the automatic detection of user stances in social media conversations and providing a nuanced understanding of public sentiment on complex issues. Recent years have seen a surge of research interest in developing effective stance detection methods, with contributions from multiple communities, including natural language processing, web science, and social computing. This paper provides a comprehensive survey of stance detection techniques on social media, covering task definitions, datasets, approaches, and future directions. We review traditional stance detection models as well as state-of-the-art methods based on large language models, and discuss their strengths and limitations. Our survey highlights the importance of stance detection in understanding public opinion and sentiment, and identifies gaps in current research. We conclude by outlining potential future directions for stance detection on social media, including the need for more robust and generalizable models and the importance of addressing emerging challenges such as multi-modal stance detection and stance detection in low-resource languages.
Submitted 25 November, 2024; v1 submitted 23 September, 2024; originally announced September 2024.