Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 203 results for author: <span class="mathjax">Lyu, S</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Lyu%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Lyu, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lyu%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lyu, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15450">arXiv:2411.15450</a> <span> [<a href="https://arxiv.org/pdf/2411.15450">pdf</a>, <a href="https://arxiv.org/format/2411.15450">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Unveiling the Achilles' Heel: Backdoor Watermarking Forgery Attack in Public Dataset Protection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiying Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhi Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dongjie Liu</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+S">Shengda Zhuo</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+G">Guanggang Geng</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+J">Jian Weng</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shanxiang Lyu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xiaobo Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15450v1-abstract-short" style="display: inline;"> High-quality datasets can greatly promote the development of technology. However, dataset construction is expensive and time-consuming, and public datasets are easily exploited by opportunists who are greedy for quick gains, which seriously infringes the rights and interests of dataset owners. At present, backdoor watermarks redefine dataset protection as proof of ownership and become a popular me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15450v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15450v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15450v1-abstract-full" style="display: none;"> High-quality datasets can greatly promote the development of technology. However, dataset construction is expensive and time-consuming, and public datasets are easily exploited by opportunists who are greedy for quick gains, which seriously infringes the rights and interests of dataset owners. At present, backdoor watermarks redefine dataset protection as proof of ownership and become a popular method to protect the copyright of public datasets, which effectively safeguards the rights of owners and promotes the development of open source communities. In this paper, we question the reliability of backdoor watermarks and re-examine them from the perspective of attackers. On the one hand, we refine the process of backdoor watermarks by introducing a third-party judicial agency to enhance its practical applicability in real-world scenarios. On the other hand, by exploring the problem of forgery attacks, we reveal the inherent flaws of the dataset ownership verification process. Specifically, we design a Forgery Watermark Generator (FW-Gen) to generate forged watermarks and define a distillation loss between the original watermark and the forged watermark to transfer the information in the original watermark to the forged watermark. Extensive experiments show that forged watermarks have the same statistical significance as original watermarks in copyright verification tests under various conditions and scenarios, indicating that dataset ownership verification results are insufficient to determine infringement. These findings highlight the unreliability of backdoor watermarking methods for dataset ownership verification and suggest new directions for enhancing methods for protecting public datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15450v1-abstract-full').style.display = 'none'; document.getElementById('2411.15450v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15124">arXiv:2411.15124</a> <span> [<a href="https://arxiv.org/pdf/2411.15124">pdf</a>, <a href="https://arxiv.org/format/2411.15124">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> T脺LU 3: Pushing Frontiers in Open Language Model Post-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shengyi Huang</a>, <a href="/search/cs?searchtype=author&query=Ivison%2C+H">Hamish Ivison</a>, <a href="/search/cs?searchtype=author&query=Brahman%2C+F">Faeze Brahman</a>, <a href="/search/cs?searchtype=author&query=Miranda%2C+L+J+V">Lester James V. Miranda</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Alisa Liu</a>, <a href="/search/cs?searchtype=author&query=Dziri%2C+N">Nouha Dziri</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shane Lyu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuling Gu</a>, <a href="/search/cs?searchtype=author&query=Malik%2C+S">Saumya Malik</a>, <a href="/search/cs?searchtype=author&query=Graf%2C+V">Victoria Graf</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+J+D">Jena D. Hwang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiangjiang Yang</a>, <a href="/search/cs?searchtype=author&query=Bras%2C+R+L">Ronan Le Bras</a>, <a href="/search/cs?searchtype=author&query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&query=Wilhelm%2C+C">Chris Wilhelm</a>, <a href="/search/cs?searchtype=author&query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&query=Dasigi%2C+P">Pradeep Dasigi</a>, <a href="/search/cs?searchtype=author&query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15124v1-abstract-short" style="display: inline;"> Language model post-training is applied to refine behaviors and unlock new skills across a wide range of recent language models, but open recipes for applying these techniques lag behind proprietary ones. The underlying training data and recipes for post-training are simultaneously the most important pieces of the puzzle and the portion with the least transparency. To bridge this gap, we introduce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15124v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15124v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15124v1-abstract-full" style="display: none;"> Language model post-training is applied to refine behaviors and unlock new skills across a wide range of recent language models, but open recipes for applying these techniques lag behind proprietary ones. The underlying training data and recipes for post-training are simultaneously the most important pieces of the puzzle and the portion with the least transparency. To bridge this gap, we introduce T脺LU 3, a family of fully-open state-of-the-art post-trained models, alongside its data, code, and training recipes, serving as a comprehensive guide for modern post-training techniques. T脺LU 3, which builds on Llama 3.1 base models, achieves results surpassing the instruct versions of Llama 3.1, Qwen 2.5, Mistral, and even closed models such as GPT-4o-mini and Claude 3.5-Haiku. The training algorithms for our models include supervised finetuning (SFT), Direct Preference Optimization (DPO), and a novel method we call Reinforcement Learning with Verifiable Rewards (RLVR). With T脺LU 3, we introduce a multi-task evaluation scheme for post-training recipes with development and unseen evaluations, standard benchmark implementations, and substantial decontamination of existing open datasets on said benchmarks. We conclude with analysis and discussion of training methods that did not reliably improve performance. In addition to the T脺LU 3 model weights and demo, we release the complete recipe -- including datasets for diverse core skills, a robust toolkit for data curation and evaluation, the training code and infrastructure, and, most importantly, a detailed report for reproducing and further adapting the T脺LU 3 approach to more domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15124v1-abstract-full').style.display = 'none'; document.getElementById('2411.15124v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09265">arXiv:2411.09265</a> <span> [<a href="https://arxiv.org/pdf/2411.09265">pdf</a>, <a href="https://arxiv.org/format/2411.09265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BEARD: Benchmarking the Adversarial Robustness for Dataset Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wenquan Feng</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shuchang Lyu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Guangliang Cheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowei Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09265v1-abstract-short" style="display: inline;"> Dataset Distillation (DD) is an emerging technique that compresses large-scale datasets into significantly smaller synthesized datasets while preserving high test performance and enabling the efficient training of large models. However, current research primarily focuses on enhancing evaluation accuracy under limited compression ratios, often overlooking critical security concerns such as adversar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09265v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09265v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09265v1-abstract-full" style="display: none;"> Dataset Distillation (DD) is an emerging technique that compresses large-scale datasets into significantly smaller synthesized datasets while preserving high test performance and enabling the efficient training of large models. However, current research primarily focuses on enhancing evaluation accuracy under limited compression ratios, often overlooking critical security concerns such as adversarial robustness. A key challenge in evaluating this robustness lies in the complex interactions between distillation methods, model architectures, and adversarial attack strategies, which complicate standardized assessments. To address this, we introduce BEARD, an open and unified benchmark designed to systematically assess the adversarial robustness of DD methods, including DM, IDM, and BACON. BEARD encompasses a variety of adversarial attacks (e.g., FGSM, PGD, C&W) on distilled datasets like CIFAR-10/100 and TinyImageNet. Utilizing an adversarial game framework, it introduces three key metrics: Robustness Ratio (RR), Attack Efficiency Ratio (AE), and Comprehensive Robustness-Efficiency Index (CREI). Our analysis includes unified benchmarks, various Images Per Class (IPC) settings, and the effects of adversarial training. Results are available on the BEARD Leaderboard, along with a library providing model and dataset pools to support reproducible research. Access the code at BEARD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09265v1-abstract-full').style.display = 'none'; document.getElementById('2411.09265v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05878">arXiv:2411.05878</a> <span> [<a href="https://arxiv.org/pdf/2411.05878">pdf</a>, <a href="https://arxiv.org/format/2411.05878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Joint-Optimized Unsupervised Adversarial Domain Adaptation in Remote Sensing Segmentation with Prompted Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shuchang Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Guangliang Cheng</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yiwei He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guangbiao Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenwei Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05878v2-abstract-short" style="display: inline;"> Unsupervised Domain Adaptation for Remote Sensing Semantic Segmentation (UDA-RSSeg) addresses the challenge of adapting a model trained on source domain data to target domain samples, thereby minimizing the need for annotated data across diverse remote sensing scenes. This task presents two principal challenges: (1) severe inconsistencies in feature representation across different remote sensing d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05878v2-abstract-full').style.display = 'inline'; document.getElementById('2411.05878v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05878v2-abstract-full" style="display: none;"> Unsupervised Domain Adaptation for Remote Sensing Semantic Segmentation (UDA-RSSeg) addresses the challenge of adapting a model trained on source domain data to target domain samples, thereby minimizing the need for annotated data across diverse remote sensing scenes. This task presents two principal challenges: (1) severe inconsistencies in feature representation across different remote sensing domains, and (2) a domain gap that emerges due to the representation bias of source domain patterns when translating features to predictive logits. To tackle these issues, we propose a joint-optimized adversarial network incorporating the "Segment Anything Model (SAM) (SAM-JOANet)" for UDA-RSSeg. Our approach integrates SAM to leverage its robust generalized representation capabilities, thereby alleviating feature inconsistencies. We introduce a finetuning decoder designed to convert SAM-Encoder features into predictive logits. Additionally, a feature-level adversarial-based prompted segmentor is employed to generate class-agnostic maps, which guide the finetuning decoder's feature representations. The network is optimized end-to-end, combining the prompted segmentor and the finetuning decoder. Extensive evaluations on benchmark datasets, including ISPRS (Potsdam/Vaihingen) and CITY-OSM (Paris/Chicago), demonstrate the effectiveness of our method. The results, supported by visualization and analysis, confirm the method's interpretability and robustness. The code of this paper is available at https://github.com/CV-ShuchangLyu/SAM-JOANet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05878v2-abstract-full').style.display = 'none'; document.getElementById('2411.05878v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages,6 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00419">arXiv:2411.00419</a> <span> [<a href="https://arxiv.org/pdf/2411.00419">pdf</a>, <a href="https://arxiv.org/format/2411.00419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Argus: Multi-View Egocentric Human Mesh Reconstruction Based on Stripped-Down Wearable mmWave Add-on </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+D">Di Duan</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shengzhe Lyu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+M">Mu Yuan</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianxing Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weitao Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kaishun Wu</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+G">Guoliang Xing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00419v1-abstract-short" style="display: inline;"> In this paper, we propose Argus, a wearable add-on system based on stripped-down (i.e., compact, lightweight, low-power, limited-capability) mmWave radars. It is the first to achieve egocentric human mesh reconstruction in a multi-view manner. Compared with conventional frontal-view mmWave sensing solutions, it addresses several pain points, such as restricted sensing range, occlusion, and the mul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00419v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00419v1-abstract-full" style="display: none;"> In this paper, we propose Argus, a wearable add-on system based on stripped-down (i.e., compact, lightweight, low-power, limited-capability) mmWave radars. It is the first to achieve egocentric human mesh reconstruction in a multi-view manner. Compared with conventional frontal-view mmWave sensing solutions, it addresses several pain points, such as restricted sensing range, occlusion, and the multipath effect caused by surroundings. To overcome the limited capabilities of the stripped-down mmWave radars (with only one transmit antenna and three receive antennas), we tackle three main challenges and propose a holistic solution, including tailored hardware design, sophisticated signal processing, and a deep neural network optimized for high-dimensional complex point clouds. Extensive evaluation shows that Argus achieves performance comparable to traditional solutions based on high-capability mmWave radars, with an average vertex error of 6.5 cm, solely using stripped-down radars deployed in a multi-view configuration. It presents robustness and practicality across conditions, such as with unseen users and different host devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00419v1-abstract-full').style.display = 'none'; document.getElementById('2411.00419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 25 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> C.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18894">arXiv:2410.18894</a> <span> [<a href="https://arxiv.org/pdf/2410.18894">pdf</a>, <a href="https://arxiv.org/format/2410.18894">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Meta-Learning with Heterogeneous Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Si%2C+Z">Zhaofeng Si</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+K">Kaiyi Ji</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18894v1-abstract-short" style="display: inline;"> Meta-learning is a general approach to equip machine learning models with the ability to handle few-shot scenarios when dealing with many tasks. Most existing meta-learning methods work based on the assumption that all tasks are of equal importance. However, real-world applications often present heterogeneous tasks characterized by varying difficulty levels, noise in training samples, or being dis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18894v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18894v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18894v1-abstract-full" style="display: none;"> Meta-learning is a general approach to equip machine learning models with the ability to handle few-shot scenarios when dealing with many tasks. Most existing meta-learning methods work based on the assumption that all tasks are of equal importance. However, real-world applications often present heterogeneous tasks characterized by varying difficulty levels, noise in training samples, or being distinctively different from most other tasks. In this paper, we introduce a novel meta-learning method designed to effectively manage such heterogeneous tasks by employing rank-based task-level learning objectives, Heterogeneous Tasks Robust Meta-learning (HeTRoM). HeTRoM is proficient in handling heterogeneous tasks, and it prevents easy tasks from overwhelming the meta-learner. The approach allows for an efficient iterative optimization algorithm based on bi-level optimization, which is then improved by integrating statistical guidance. Our experimental results demonstrate that our method provides flexibility, enabling users to adapt to diverse task settings and enhancing the meta-learner's overall performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18894v1-abstract-full').style.display = 'none'; document.getElementById('2410.18894v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15318">arXiv:2410.15318</a> <span> [<a href="https://arxiv.org/pdf/2410.15318">pdf</a>, <a href="https://arxiv.org/format/2410.15318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SNAP: Stopping Catastrophic Forgetting in Hebbian Learning with Sigmoidal Neuronal Adaptive Plasticity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+P">Patrick Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shiyan Liu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Sicheng Lyu</a>, <a href="/search/cs?searchtype=author&query=Pr%C3%A9mont-Schwarz%2C+I">Isabeau Pr茅mont-Schwarz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15318v1-abstract-short" style="display: inline;"> Artificial Neural Networks (ANNs) suffer from catastrophic forgetting, where the learning of new tasks causes the catastrophic forgetting of old tasks. Existing Machine Learning (ML) algorithms, including those using Stochastic Gradient Descent (SGD) and Hebbian Learning typically update their weights linearly with experience i.e., independently of their current strength. This contrasts with biolo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15318v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15318v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15318v1-abstract-full" style="display: none;"> Artificial Neural Networks (ANNs) suffer from catastrophic forgetting, where the learning of new tasks causes the catastrophic forgetting of old tasks. Existing Machine Learning (ML) algorithms, including those using Stochastic Gradient Descent (SGD) and Hebbian Learning typically update their weights linearly with experience i.e., independently of their current strength. This contrasts with biological neurons, which at intermediate strengths are very plastic, but consolidate with Long-Term Potentiation (LTP) once they reach a certain strength. We hypothesize this mechanism might help mitigate catastrophic forgetting. We introduce Sigmoidal Neuronal Adaptive Plasticity (SNAP) an artificial approximation to Long-Term Potentiation for ANNs by having the weights follow a sigmoidal growth behaviour allowing the weights to consolidate and stabilize when they reach sufficiently large or small values. We then compare SNAP to linear weight growth and exponential weight growth and see that SNAP completely prevents the forgetting of previous tasks for Hebbian Learning but not for SGD-base learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15318v1-abstract-full').style.display = 'none'; document.getElementById('2410.15318v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 11 figures, accepted at Montr茅al AI and Neuroscience (MAIN) 2024 conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11502">arXiv:2410.11502</a> <span> [<a href="https://arxiv.org/pdf/2410.11502">pdf</a>, <a href="https://arxiv.org/format/2410.11502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Offline Model-Based Optimization by Learning to Rank </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+R">Rong-Xi Tan</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+K">Ke Xue</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shen-Huan Lyu</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+H">Haopu Shang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaoyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+S">Sheng Fu</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C">Chao Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11502v1-abstract-short" style="display: inline;"> Offline model-based optimization (MBO) aims to identify a design that maximizes a black-box function using only a fixed, pre-collected dataset of designs and their corresponding scores. A common approach in offline MBO is to train a regression-based surrogate model by minimizing mean squared error (MSE) and then find the best design within this surrogate model by different optimizers (e.g., gradie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11502v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11502v1-abstract-full" style="display: none;"> Offline model-based optimization (MBO) aims to identify a design that maximizes a black-box function using only a fixed, pre-collected dataset of designs and their corresponding scores. A common approach in offline MBO is to train a regression-based surrogate model by minimizing mean squared error (MSE) and then find the best design within this surrogate model by different optimizers (e.g., gradient ascent). However, a critical challenge is the risk of out-of-distribution errors, i.e., the surrogate model may typically overestimate the scores and mislead the optimizers into suboptimal regions. Prior works have attempted to address this issue in various ways, such as using regularization techniques and ensemble learning to enhance the robustness of the model, but it still remains. In this paper, we argue that regression models trained with MSE are not well-aligned with the primary goal of offline MBO, which is to select promising designs rather than to predict their scores precisely. Notably, if a surrogate model can maintain the order of candidate designs based on their relative score relationships, it can produce the best designs even without precise predictions. To validate it, we conduct experiments to compare the relationship between the quality of the final designs and MSE, finding that the correlation is really very weak. In contrast, a metric that measures order-maintaining quality shows a significantly stronger correlation. Based on this observation, we propose learning a ranking-based model that leverages learning to rank techniques to prioritize promising designs based on their relative scores. We show that the generalization error on ranking loss can be well bounded. Empirical results across diverse tasks demonstrate the superior performance of our proposed ranking-based models than twenty existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11502v1-abstract-full').style.display = 'none'; document.getElementById('2410.11502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06126">arXiv:2410.06126</a> <span> [<a href="https://arxiv.org/pdf/2410.06126">pdf</a>, <a href="https://arxiv.org/format/2410.06126">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> $\textit{X}^2$-DFD: A framework for e${X}$plainable and e${X}$tendable Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yize Chen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhiyuan Yan</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06126v1-abstract-short" style="display: inline;"> Detecting deepfakes has become an important task. Most existing detection methods provide only real/fake predictions without offering human-comprehensible explanations. Recent studies leveraging MLLMs for deepfake detection have shown improvements in explainability. However, the performance of pre-trained MLLMs (e.g., LLaVA) remains limited due to a lack of understanding of their capabilities for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06126v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06126v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06126v1-abstract-full" style="display: none;"> Detecting deepfakes has become an important task. Most existing detection methods provide only real/fake predictions without offering human-comprehensible explanations. Recent studies leveraging MLLMs for deepfake detection have shown improvements in explainability. However, the performance of pre-trained MLLMs (e.g., LLaVA) remains limited due to a lack of understanding of their capabilities for this task and strategies to enhance them. In this work, we empirically assess the strengths and weaknesses of MLLMs specifically in deepfake detection via forgery features analysis. Building on these assessments, we propose a novel framework called ${X}^2$-DFD, consisting of three core modules. The first module, Model Feature Assessment (MFA), measures the detection capabilities of forgery features intrinsic to MLLMs, and gives a descending ranking of these features. The second module, Strong Feature Strengthening (SFS), enhances the detection and explanation capabilities by fine-tuning the MLLM on a dataset constructed based on the top-ranked features. The third module, Weak Feature Supplementing (WFS), improves the fine-tuned MLLM's capabilities on lower-ranked features by integrating external dedicated deepfake detectors. To verify the effectiveness of this framework, we further present a practical implementation, where an automated forgery features generation, evaluation, and ranking procedure is designed for MFA module; an automated generation procedure of the fine-tuning dataset containing real and fake images with explanations based on top-ranked features is developed for SFS model; an external conventional deepfake detector focusing on blending artifact, which corresponds to a low detection capability in the pre-trained MLLM, is integrated for WFS module. Experiments show that our approach enhances both detection and explanation performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06126v1-abstract-full').style.display = 'none'; document.getElementById('2410.06126v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19681">arXiv:2409.19681</a> <span> [<a href="https://arxiv.org/pdf/2409.19681">pdf</a>, <a href="https://arxiv.org/format/2409.19681">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Simple and Fast Distillation of Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhenyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Defang Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Can Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chun Chen</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19681v1-abstract-short" style="display: inline;"> Diffusion-based generative models have demonstrated their powerful performance across various tasks, but this comes at a cost of the slow sampling speed. To achieve both efficient and high-quality synthesis, various distillation-based accelerated sampling methods have been developed recently. However, they generally require time-consuming fine tuning with elaborate designs to achieve satisfactory… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19681v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19681v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19681v1-abstract-full" style="display: none;"> Diffusion-based generative models have demonstrated their powerful performance across various tasks, but this comes at a cost of the slow sampling speed. To achieve both efficient and high-quality synthesis, various distillation-based accelerated sampling methods have been developed recently. However, they generally require time-consuming fine tuning with elaborate designs to achieve satisfactory performance in a specific number of function evaluation (NFE), making them difficult to employ in practice. To address this issue, we propose Simple and Fast Distillation (SFD) of diffusion models, which simplifies the paradigm used in existing methods and largely shortens their fine-tuning time up to 1000$\times$. We begin with a vanilla distillation-based sampling method and boost its performance to state of the art by identifying and addressing several small yet vital factors affecting the synthesis efficiency and quality. Our method can also achieve sampling with variable NFEs using a single distilled model. Extensive experiments demonstrate that SFD strikes a good balance between the sample quality and fine-tuning costs in few-step image generation task. For example, SFD achieves 4.53 FID (NFE=2) on CIFAR-10 with only 0.64 hours of fine-tuning on a single NVIDIA A100 GPU. Our code is available at https://github.com/zju-pi/diff-sampler. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19681v1-abstract-full').style.display = 'none'; document.getElementById('2409.19681v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19365">arXiv:2409.19365</a> <span> [<a href="https://arxiv.org/pdf/2409.19365">pdf</a>, <a href="https://arxiv.org/format/2409.19365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Conditional Image Synthesis with Diffusion Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhan%2C+Z">Zheyuan Zhan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Defang Chen</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jian-Ping Mei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhenghe Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiawei Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chun Chen</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Can Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19365v2-abstract-short" style="display: inline;"> Conditional image synthesis based on user-specified requirements is a key component in creating complex visual content. In recent years, diffusion-based generative modeling has become a highly effective way for conditional image synthesis, leading to exponential growth in the literature. However, the complexity of diffusion-based modeling, the wide range of image synthesis tasks, and the diversity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19365v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19365v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19365v2-abstract-full" style="display: none;"> Conditional image synthesis based on user-specified requirements is a key component in creating complex visual content. In recent years, diffusion-based generative modeling has become a highly effective way for conditional image synthesis, leading to exponential growth in the literature. However, the complexity of diffusion-based modeling, the wide range of image synthesis tasks, and the diversity of conditioning mechanisms present significant challenges for researchers to keep up with rapid developments and understand the core concepts on this topic. In this survey, we categorize existing works based on how conditions are integrated into the two fundamental components of diffusion-based modeling, i.e., the denoising network and the sampling process. We specifically highlight the underlying principles, advantages, and potential challenges of various conditioning approaches in the training, re-purposing, and specialization stages to construct a desired denoising network. We also summarize six mainstream conditioning mechanisms in the essential sampling process. All discussions are centered around popular applications. Finally, we pinpoint some critical yet still open problems to be solved in the future and suggest some possible solutions. Our reviewed works are itemized at https://github.com/zju-pi/Awesome-Conditional-Diffusion-Models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19365v2-abstract-full').style.display = 'none'; document.getElementById('2409.19365v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09638">arXiv:2409.09638</a> <span> [<a href="https://arxiv.org/pdf/2409.09638">pdf</a>, <a href="https://arxiv.org/format/2409.09638">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Multi-view Hypergraph-based Contrastive Learning Model for Cold-Start Micro-video Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Sisuo Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiuze Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xuming Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09638v1-abstract-short" style="display: inline;"> With the widespread use of mobile devices and the rapid growth of micro-video platforms such as TikTok and Kwai, the demand for personalized micro-video recommendation systems has significantly increased. Micro-videos typically contain diverse information, such as textual metadata, visual cues (e.g., cover images), and dynamic video content, significantly affecting user interaction and engagement… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09638v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09638v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09638v1-abstract-full" style="display: none;"> With the widespread use of mobile devices and the rapid growth of micro-video platforms such as TikTok and Kwai, the demand for personalized micro-video recommendation systems has significantly increased. Micro-videos typically contain diverse information, such as textual metadata, visual cues (e.g., cover images), and dynamic video content, significantly affecting user interaction and engagement patterns. However, most existing approaches often suffer from the problem of over-smoothing, which limits their ability to capture comprehensive interaction information effectively. Additionally, cold-start scenarios present ongoing challenges due to sparse interaction data and the underutilization of available interaction signals. To address these issues, we propose a Multi-view Hypergraph-based Contrastive learning model for cold-start micro-video Recommendation (MHCR). MHCR introduces a multi-view multimodal feature extraction layer to capture interaction signals from various perspectives and incorporates multi-view self-supervised learning tasks to provide additional supervisory signals. Through extensive experiments on two real-world datasets, we show that MHCR significantly outperforms existing video recommendation models and effectively mitigates cold-start challenges. Our code is available at https://anonymous.4open.science/r/MHCR-02EF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09638v1-abstract-full').style.display = 'none'; document.getElementById('2409.09638v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16305">arXiv:2408.16305</a> <span> [<a href="https://arxiv.org/pdf/2408.16305">pdf</a>, <a href="https://arxiv.org/format/2408.16305">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semantics-Oriented Multitask Learning for DeepFake Detection: A Joint Embedding Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+M">Mian Zou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Baosheng Yu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yibing Zhan</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kede Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16305v1-abstract-short" style="display: inline;"> In recent years, the multimedia forensics and security community has seen remarkable progress in multitask learning for DeepFake (i.e., face forgery) detection. The prevailing strategy has been to frame DeepFake detection as a binary classification problem augmented by manipulation-oriented auxiliary tasks. This strategy focuses on learning features specific to face manipulations, which exhibit li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16305v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16305v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16305v1-abstract-full" style="display: none;"> In recent years, the multimedia forensics and security community has seen remarkable progress in multitask learning for DeepFake (i.e., face forgery) detection. The prevailing strategy has been to frame DeepFake detection as a binary classification problem augmented by manipulation-oriented auxiliary tasks. This strategy focuses on learning features specific to face manipulations, which exhibit limited generalizability. In this paper, we delve deeper into semantics-oriented multitask learning for DeepFake detection, leveraging the relationships among face semantics via joint embedding. We first propose an automatic dataset expansion technique that broadens current face forgery datasets to support semantics-oriented DeepFake detection tasks at both the global face attribute and local face region levels. Furthermore, we resort to joint embedding of face images and their corresponding labels (depicted by textual descriptions) for prediction. This approach eliminates the need for manually setting task-agnostic and task-specific parameters typically required when predicting labels directly from images. In addition, we employ a bi-level optimization strategy to dynamically balance the fidelity loss weightings of various tasks, making the training process fully automated. Extensive experiments on six DeepFake datasets show that our method improves the generalizability of DeepFake detection and, meanwhile, renders some degree of model interpretation by providing human-understandable explanations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16305v1-abstract-full').style.display = 'none'; document.getElementById('2408.16305v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13787">arXiv:2408.13787</a> <span> [<a href="https://arxiv.org/pdf/2408.13787">pdf</a>, <a href="https://arxiv.org/format/2408.13787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Mask-Encoded Sparsification: Mitigating Biased Gradients in Communication-Efficient Split Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+Z">Zhihao Qu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shen-Huan Lyu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+M">Miao Cai</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+B">Baoliu Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13787v3-abstract-short" style="display: inline;"> This paper introduces a novel framework designed to achieve a high compression ratio in Split Learning (SL) scenarios where resource-constrained devices are involved in large-scale model training. Our investigations demonstrate that compressing feature maps within SL leads to biased gradients that can negatively impact the convergence rates and diminish the generalization capabilities of the resul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13787v3-abstract-full').style.display = 'inline'; document.getElementById('2408.13787v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13787v3-abstract-full" style="display: none;"> This paper introduces a novel framework designed to achieve a high compression ratio in Split Learning (SL) scenarios where resource-constrained devices are involved in large-scale model training. Our investigations demonstrate that compressing feature maps within SL leads to biased gradients that can negatively impact the convergence rates and diminish the generalization capabilities of the resulting models. Our theoretical analysis provides insights into how compression errors critically hinder SL performance, which previous methodologies underestimate. To address these challenges, we employ a narrow bit-width encoded mask to compensate for the sparsification error without increasing the order of time complexity. Supported by rigorous theoretical analysis, our framework significantly reduces compression errors and accelerates the convergence. Extensive experiments also verify that our method outperforms existing solutions regarding training efficiency and communication complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13787v3-abstract-full').style.display = 'none'; document.getElementById('2408.13787v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 27th European Conference on Artificial Intelligence, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07703">arXiv:2408.07703</a> <span> [<a href="https://arxiv.org/pdf/2408.07703">pdf</a>, <a href="https://arxiv.org/format/2408.07703">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Knowledge Distillation with Refined Logits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wujie Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Defang Chen</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Genlang Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chun Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Can Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07703v2-abstract-short" style="display: inline;"> Recent research on knowledge distillation has increasingly focused on logit distillation because of its simplicity, effectiveness, and versatility in model compression. In this paper, we introduce Refined Logit Distillation (RLD) to address the limitations of current logit distillation methods. Our approach is motivated by the observation that even high-performing teacher models can make incorrect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07703v2-abstract-full').style.display = 'inline'; document.getElementById('2408.07703v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07703v2-abstract-full" style="display: none;"> Recent research on knowledge distillation has increasingly focused on logit distillation because of its simplicity, effectiveness, and versatility in model compression. In this paper, we introduce Refined Logit Distillation (RLD) to address the limitations of current logit distillation methods. Our approach is motivated by the observation that even high-performing teacher models can make incorrect predictions, creating a conflict between the standard distillation loss and the cross-entropy loss. This conflict can undermine the consistency of the student model's learning objectives. Previous attempts to use labels to empirically correct teacher predictions may undermine the class correlation. In contrast, our RLD employs labeling information to dynamically refine teacher logits. In this way, our method can effectively eliminate misleading information from the teacher while preserving crucial class correlations, thus enhancing the value and efficiency of distilled knowledge. Experimental results on CIFAR-100 and ImageNet demonstrate its superiority over existing methods. The code is provided at \text{https://github.com/zju-SWJ/RLD}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07703v2-abstract-full').style.display = 'none'; document.getElementById('2408.07703v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04300">arXiv:2408.04300</a> <span> [<a href="https://arxiv.org/pdf/2408.04300">pdf</a>, <a href="https://arxiv.org/format/2408.04300">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Explainable Non-local Network for COVID-19 Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingfu Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Peng Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jing Hu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jun Guo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04300v1-abstract-short" style="display: inline;"> The CNN has achieved excellent results in the automatic classification of medical images. In this study, we propose a novel deep residual 3D attention non-local network (NL-RAN) to classify CT images included COVID-19, common pneumonia, and normal to perform rapid and explainable COVID-19 diagnosis. We built a deep residual 3D attention non-local network that could achieve end-to-end training. The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04300v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04300v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04300v1-abstract-full" style="display: none;"> The CNN has achieved excellent results in the automatic classification of medical images. In this study, we propose a novel deep residual 3D attention non-local network (NL-RAN) to classify CT images included COVID-19, common pneumonia, and normal to perform rapid and explainable COVID-19 diagnosis. We built a deep residual 3D attention non-local network that could achieve end-to-end training. The network is embedded with a nonlocal module to capture global information, while a 3D attention module is embedded to focus on the details of the lesion so that it can directly analyze the 3D lung CT and output the classification results. The output of the attention module can be used as a heat map to increase the interpretability of the model. 4079 3D CT scans were included in this study. Each scan had a unique label (novel coronavirus pneumonia, common pneumonia, and normal). The CT scans cohort was randomly split into a training set of 3263 scans, a validation set of 408 scans, and a testing set of 408 scans. And compare with existing mainstream classification methods, such as CovNet, CBAM, ResNet, etc. Simultaneously compare the visualization results with visualization methods such as CAM. Model performance was evaluated using the Area Under the ROC Curve(AUC), precision, and F1-score. The NL-RAN achieved the AUC of 0.9903, the precision of 0.9473, and the F1-score of 0.9462, surpass all the classification methods compared. The heat map output by the attention module is also clearer than the heat map output by CAM. Our experimental results indicate that our proposed method performs significantly better than existing methods. In addition, the first attention module outputs a heat map containing detailed outline information to increase the interpretability of the model. Our experiments indicate that the inference of our model is fast. It can provide real-time assistance with diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04300v1-abstract-full').style.display = 'none'; document.getElementById('2408.04300v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02191">arXiv:2408.02191</a> <span> [<a href="https://arxiv.org/pdf/2408.02191">pdf</a>, <a href="https://arxiv.org/format/2408.02191">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dense Feature Interaction Network for Image Inpainting Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Ye Yao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T">Tingfeng Han</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shan Jia</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02191v1-abstract-short" style="display: inline;"> Image inpainting, which is the task of filling in missing areas in an image, is a common image editing technique. Inpainting can be used to conceal or alter image contents in malicious manipulation of images, driving the need for research in image inpainting detection. Existing methods mostly rely on a basic encoder-decoder structure, which often results in a high number of false positives or miss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02191v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02191v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02191v1-abstract-full" style="display: none;"> Image inpainting, which is the task of filling in missing areas in an image, is a common image editing technique. Inpainting can be used to conceal or alter image contents in malicious manipulation of images, driving the need for research in image inpainting detection. Existing methods mostly rely on a basic encoder-decoder structure, which often results in a high number of false positives or misses the inpainted regions, especially when dealing with targets of varying semantics and scales. Additionally, the absence of an effective approach to capture boundary artifacts leads to less accurate edge localization. In this paper, we describe a new method for inpainting detection based on a Dense Feature Interaction Network (DeFI-Net). DeFI-Net uses a novel feature pyramid architecture to capture and amplify multi-scale representations across various stages, thereby improving the detection of image inpainting by better revealing feature-level interactions. Additionally, the network can adaptively direct the lower-level features, which carry edge and shape information, to refine the localization of manipulated regions while integrating the higher-level semantic features. Using DeFI-Net, we develop a method combining complementary representations to accurately identify inpainted areas. Evaluation on five image inpainting datasets demonstrate the effectiveness of our approach, which achieves state-of-the-art performance in detecting inpainting across diverse models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02191v1-abstract-full').style.display = 'none'; document.getElementById('2408.02191v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21788">arXiv:2407.21788</a> <span> [<a href="https://arxiv.org/pdf/2407.21788">pdf</a>, <a href="https://arxiv.org/format/2407.21788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Vision-Language Model Based Handwriting Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chauhan%2C+M">Mihir Chauhan</a>, <a href="/search/cs?searchtype=author&query=Satbhai%2C+A">Abhishek Satbhai</a>, <a href="/search/cs?searchtype=author&query=Hashemi%2C+M+A">Mohammad Abuzar Hashemi</a>, <a href="/search/cs?searchtype=author&query=Ali%2C+M+B">Mir Basheer Ali</a>, <a href="/search/cs?searchtype=author&query=Ramamurthy%2C+B">Bina Ramamurthy</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Mingchen Gao</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Srihari%2C+S">Sargur Srihari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21788v1-abstract-short" style="display: inline;"> Handwriting Verification is a critical in document forensics. Deep learning based approaches often face skepticism from forensic document examiners due to their lack of explainability and reliance on extensive training data and handcrafted features. This paper explores using Vision Language Models (VLMs), such as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By leveraging th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21788v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21788v1-abstract-full" style="display: none;"> Handwriting Verification is a critical in document forensics. Deep learning based approaches often face skepticism from forensic document examiners due to their lack of explainability and reliance on extensive training data and handcrafted features. This paper explores using Vision Language Models (VLMs), such as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By leveraging their Visual Question Answering capabilities and 0-shot Chain-of-Thought (CoT) reasoning, our goal is to provide clear, human-understandable explanations for model decisions. Our experiments on the CEDAR handwriting dataset demonstrate that VLMs offer enhanced interpretability, reduce the need for large training datasets, and adapt better to diverse handwriting styles. However, results show that the CNN-based ResNet-18 architecture outperforms the 0-shot CoT prompt engineering approach with GPT-4o (Accuracy: 70%) and supervised fine-tuned PaliGemma (Accuracy: 71%), achieving an accuracy of 84% on the CEDAR AND dataset. These findings highlight the potential of VLMs in generating human-interpretable decisions while underscoring the need for further advancements to match the performance of specialized deep learning models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21788v1-abstract-full').style.display = 'none'; document.getElementById('2407.21788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 Pages, 1 Figure, 1 Table, Accepted as Short paper at Irish Machine Vision and Image Processing (IMVIP) Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05108">arXiv:2407.05108</a> <span> [<a href="https://arxiv.org/pdf/2407.05108">pdf</a>, <a href="https://arxiv.org/format/2407.05108">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> The Role of Depth, Width, and Tree Size in Expressiveness of Deep Forest </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shen-Huan Lyu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jin-Hui Wu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qin-Cheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+B">Baoliu Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05108v1-abstract-short" style="display: inline;"> Random forests are classical ensemble algorithms that construct multiple randomized decision trees and aggregate their predictions using naive averaging. \citet{zhou2019deep} further propose a deep forest algorithm with multi-layer forests, which outperforms random forests in various tasks. The performance of deep forests is related to three hyperparameters in practice: depth, width, and tree size… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05108v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05108v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05108v1-abstract-full" style="display: none;"> Random forests are classical ensemble algorithms that construct multiple randomized decision trees and aggregate their predictions using naive averaging. \citet{zhou2019deep} further propose a deep forest algorithm with multi-layer forests, which outperforms random forests in various tasks. The performance of deep forests is related to three hyperparameters in practice: depth, width, and tree size, but little has been known about its theoretical explanation. This work provides the first upper and lower bounds on the approximation complexity of deep forests concerning the three hyperparameters. Our results confirm the distinctive role of depth, which can exponentially enhance the expressiveness of deep forests compared with width and tree size. Experiments confirm the theoretical findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05108v1-abstract-full').style.display = 'none'; document.getElementById('2407.05108v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In: Proceedings of the 27th European Conference on Artificial Intelligence, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03107">arXiv:2407.03107</a> <span> [<a href="https://arxiv.org/pdf/2407.03107">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Design of a UE5-based digital twin platform </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shaoqiu Lyu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Muzhi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sunrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shengzhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03107v1-abstract-short" style="display: inline;"> Aiming at the current mainstream 3D scene engine learning and building cost is too high, this thesis proposes a digital twin platform design program based on Unreal Engine 5 (UE5). It aims to provide a universal platform construction design process to effectively reduce the learning cost of large-scale scene construction. Taking an actual project of a unit as an example, the overall cycle work of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03107v1-abstract-full').style.display = 'inline'; document.getElementById('2407.03107v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03107v1-abstract-full" style="display: none;"> Aiming at the current mainstream 3D scene engine learning and building cost is too high, this thesis proposes a digital twin platform design program based on Unreal Engine 5 (UE5). It aims to provide a universal platform construction design process to effectively reduce the learning cost of large-scale scene construction. Taking an actual project of a unit as an example, the overall cycle work of platform building is explained, and the digital twin and data visualization technologies and applications based on UE5 are analyzed. By summarizing the project implementation into a process approach, the standardization and operability of the process pathway is improved. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03107v1-abstract-full').style.display = 'none'; document.getElementById('2407.03107v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16943">arXiv:2406.16943</a> <span> [<a href="https://arxiv.org/pdf/2406.16943">pdf</a>, <a href="https://arxiv.org/format/2406.16943">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/CSCAIoT62585.2024.00005">10.1109/CSCAIoT62585.2024.00005 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> EarDA: Towards Accurate and Data-Efficient Earable Activity Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shengzhe Lyu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongliang Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+D">Di Duan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+R">Renqi Jia</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weitao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16943v1-abstract-short" style="display: inline;"> In the realm of smart sensing with the Internet of Things, earable devices are empowered with the capability of multi-modality sensing and intelligence of context-aware computing, leading to its wide usage in Human Activity Recognition (HAR). Nonetheless, unlike the movements captured by Inertial Measurement Unit (IMU) sensors placed on the upper or lower body, those motion signals obtained from e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16943v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16943v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16943v1-abstract-full" style="display: none;"> In the realm of smart sensing with the Internet of Things, earable devices are empowered with the capability of multi-modality sensing and intelligence of context-aware computing, leading to its wide usage in Human Activity Recognition (HAR). Nonetheless, unlike the movements captured by Inertial Measurement Unit (IMU) sensors placed on the upper or lower body, those motion signals obtained from earable devices show significant changes in amplitudes and patterns, especially in the presence of dynamic and unpredictable head movements, posing a significant challenge for activity classification. In this work, we present EarDA, an adversarial-based domain adaptation system to extract the domain-independent features across different sensor locations. Moreover, while most deep learning methods commonly rely on training with substantial amounts of labeled data to offer good accuracy, the proposed scheme can release the potential usage of publicly available smartphone-based IMU datasets. Furthermore, we explore the feasibility of applying a filter-based data processing method to mitigate the impact of head movement. EarDA, the proposed system, enables more data-efficient and accurate activity sensing. It achieves an accuracy of 88.8% under HAR task, demonstrating a significant 43% improvement over methods without domain adaptation. This clearly showcases its effectiveness in mitigating domain gaps. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16943v1-abstract-full').style.display = 'none'; document.getElementById('2406.16943v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by 2024 IEEE Coupling of Sensing & Computing in AIoT Systems (CSCAIoT)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10427">arXiv:2406.10427</a> <span> [<a href="https://arxiv.org/pdf/2406.10427">pdf</a>, <a href="https://arxiv.org/format/2406.10427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Randomized Smoothing: Certified Adversarial Robustness for Multi-Step Defences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Saiyue Lyu</a>, <a href="/search/cs?searchtype=author&query=Shaikh%2C+S">Shadab Shaikh</a>, <a href="/search/cs?searchtype=author&query=Shpilevskiy%2C+F">Frederick Shpilevskiy</a>, <a href="/search/cs?searchtype=author&query=Shelhamer%2C+E">Evan Shelhamer</a>, <a href="/search/cs?searchtype=author&query=L%C3%A9cuyer%2C+M">Mathias L茅cuyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10427v2-abstract-short" style="display: inline;"> We propose Adaptive Randomized Smoothing (ARS) to certify the predictions of our test-time adaptive models against adversarial examples. ARS extends the analysis of randomized smoothing using $f$-Differential Privacy to certify the adaptive composition of multiple steps. For the first time, our theory covers the sound adaptive composition of general and high-dimensional functions of noisy inputs.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10427v2-abstract-full').style.display = 'inline'; document.getElementById('2406.10427v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10427v2-abstract-full" style="display: none;"> We propose Adaptive Randomized Smoothing (ARS) to certify the predictions of our test-time adaptive models against adversarial examples. ARS extends the analysis of randomized smoothing using $f$-Differential Privacy to certify the adaptive composition of multiple steps. For the first time, our theory covers the sound adaptive composition of general and high-dimensional functions of noisy inputs. We instantiate ARS on deep image classification to certify predictions against adversarial examples of bounded $L_{\infty}$ norm. In the $L_{\infty}$ threat model, ARS enables flexible adaptation through high-dimensional input-dependent masking. We design adaptivity benchmarks, based on CIFAR-10 and CelebA, and show that ARS improves standard test accuracy by $1$ to $15\%$ points. On ImageNet, ARS improves certified test accuracy by up to $1.6\%$ points over standard RS without adaptivity. Our code is available at https://github.com/ubc-systopia/adaptive-randomized-smoothing . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10427v2-abstract-full').style.display = 'none'; document.getElementById('2406.10427v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04745">arXiv:2406.04745</a> <span> [<a href="https://arxiv.org/pdf/2406.04745">pdf</a>, <a href="https://arxiv.org/format/2406.04745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Confidence-aware Contrastive Learning for Selective Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu-Chang Wu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shen-Huan Lyu</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+H">Haopu Shang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangyu Wang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C">Chao Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04745v1-abstract-short" style="display: inline;"> Selective classification enables models to make predictions only when they are sufficiently confident, aiming to enhance safety and reliability, which is important in high-stakes scenarios. Previous methods mainly use deep neural networks and focus on modifying the architecture of classification layers to enable the model to estimate the confidence of its prediction. This work provides a generaliz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04745v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04745v1-abstract-full" style="display: none;"> Selective classification enables models to make predictions only when they are sufficiently confident, aiming to enhance safety and reliability, which is important in high-stakes scenarios. Previous methods mainly use deep neural networks and focus on modifying the architecture of classification layers to enable the model to estimate the confidence of its prediction. This work provides a generalization bound for selective classification, disclosing that optimizing feature layers helps improve the performance of selective classification. Inspired by this theory, we propose to explicitly improve the selective classification model at the feature level for the first time, leading to a novel Confidence-aware Contrastive Learning method for Selective Classification, CCL-SC, which similarizes the features of homogeneous instances and differentiates the features of heterogeneous instances, with the strength controlled by the model's confidence. The experimental results on typical datasets, i.e., CIFAR-10, CIFAR-100, CelebA, and ImageNet, show that CCL-SC achieves significantly lower selective risk than state-of-the-art methods, across almost all coverage degrees. Moreover, it can be combined with existing methods to bring further improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04745v1-abstract-full').style.display = 'none'; document.getElementById('2406.04745v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01112">arXiv:2406.01112</a> <span> [<a href="https://arxiv.org/pdf/2406.01112">pdf</a>, <a href="https://arxiv.org/format/2406.01112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BACON: Bayesian Optimal Condensation Framework for Dataset Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hongbo Zhao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Guangliang Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtai Li</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shuchang Lyu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wenquan Feng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01112v1-abstract-short" style="display: inline;"> Dataset Distillation (DD) aims to distill knowledge from extensive datasets into more compact ones while preserving performance on the test set, thereby reducing storage costs and training expenses. However, existing methods often suffer from computational intensity, particularly exhibiting suboptimal performance with large dataset sizes due to the lack of a robust theoretical framework for analyz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01112v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01112v1-abstract-full" style="display: none;"> Dataset Distillation (DD) aims to distill knowledge from extensive datasets into more compact ones while preserving performance on the test set, thereby reducing storage costs and training expenses. However, existing methods often suffer from computational intensity, particularly exhibiting suboptimal performance with large dataset sizes due to the lack of a robust theoretical framework for analyzing the DD problem. To address these challenges, we propose the BAyesian optimal CONdensation framework (BACON), which is the first work to introduce the Bayesian theoretical framework to the literature of DD. This framework provides theoretical support for enhancing the performance of DD. Furthermore, BACON formulates the DD problem as the minimization of the expected risk function in joint probability distributions using the Bayesian framework. Additionally, by analyzing the expected risk function for optimal condensation, we derive a numerically feasible lower bound based on specific assumptions, providing an approximate solution for BACON. We validate BACON across several datasets, demonstrating its superior performance compared to existing state-of-the-art methods. For instance, under the IPC-10 setting, BACON achieves a 3.46% accuracy gain over the IDM method on the CIFAR-10 dataset and a 3.10% gain on the TinyImageNet dataset. Our extensive experiments confirm the effectiveness of BACON and its seamless integration with existing methods, thereby enhancing their performance for the DD task. Code and distilled datasets are available at BACON. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01112v1-abstract-full').style.display = 'none'; document.getElementById('2406.01112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00985">arXiv:2406.00985</a> <span> [<a href="https://arxiv.org/pdf/2406.00985">pdf</a>, <a href="https://arxiv.org/format/2406.00985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ParallelEdits: Efficient Multi-Aspect Text-Driven Image Editing with Attention Grouping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+M">Mingzhen Huang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jialing Cai</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shan Jia</a>, <a href="/search/cs?searchtype=author&query=Lokhande%2C+V+S">Vishnu Suresh Lokhande</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00985v3-abstract-short" style="display: inline;"> Text-driven image synthesis has made significant advancements with the development of diffusion models, transforming how visual content is generated from text prompts. Despite these advances, text-driven image editing, a key area in computer graphics, faces unique challenges. A major challenge is making simultaneous edits across multiple objects or attributes. Applying these methods sequentially f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00985v3-abstract-full').style.display = 'inline'; document.getElementById('2406.00985v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00985v3-abstract-full" style="display: none;"> Text-driven image synthesis has made significant advancements with the development of diffusion models, transforming how visual content is generated from text prompts. Despite these advances, text-driven image editing, a key area in computer graphics, faces unique challenges. A major challenge is making simultaneous edits across multiple objects or attributes. Applying these methods sequentially for multi-attribute edits increases computational demands and efficiency losses. In this paper, we address these challenges with significant contributions. Our main contribution is the development of ParallelEdits, a method that seamlessly manages simultaneous edits across multiple attributes. In contrast to previous approaches, ParallelEdits not only preserves the quality of single attribute edits but also significantly improves the performance of multitasking edits. This is achieved through innovative attention distribution mechanism and multi-branch design that operates across several processing heads. Additionally, we introduce the PIE-Bench++ dataset, an expansion of the original PIE-Bench dataset, to better support evaluating image-editing tasks involving multiple objects and attributes simultaneously. This dataset is a benchmark for evaluating text-driven image editing methods in multifaceted scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00985v3-abstract-full').style.display = 'none'; document.getElementById('2406.00985v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18320">arXiv:2405.18320</a> <span> [<a href="https://arxiv.org/pdf/2405.18320">pdf</a>, <a href="https://arxiv.org/format/2405.18320">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Learning Based Handwriting Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chauhan%2C+M">Mihir Chauhan</a>, <a href="/search/cs?searchtype=author&query=Hashemi%2C+M+A">Mohammad Abuzar Hashemi</a>, <a href="/search/cs?searchtype=author&query=Satbhai%2C+A">Abhishek Satbhai</a>, <a href="/search/cs?searchtype=author&query=Ali%2C+M+B">Mir Basheer Ali</a>, <a href="/search/cs?searchtype=author&query=Ramamurthy%2C+B">Bina Ramamurthy</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Mingchen Gao</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Srihari%2C+S">Sargur Srihari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18320v2-abstract-short" style="display: inline;"> We present SSL-HV: Self-Supervised Learning approaches applied to the task of Handwriting Verification. This task involves determining whether a given pair of handwritten images originate from the same or different writer distribution. We have compared the performance of multiple generative, contrastive SSL approaches against handcrafted feature extractors and supervised learning on CEDAR AND data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18320v2-abstract-full').style.display = 'inline'; document.getElementById('2405.18320v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18320v2-abstract-full" style="display: none;"> We present SSL-HV: Self-Supervised Learning approaches applied to the task of Handwriting Verification. This task involves determining whether a given pair of handwritten images originate from the same or different writer distribution. We have compared the performance of multiple generative, contrastive SSL approaches against handcrafted feature extractors and supervised learning on CEDAR AND dataset. We show that ResNet based Variational Auto-Encoder (VAE) outperforms other generative approaches achieving 76.3% accuracy, while ResNet-18 fine-tuned using Variance-Invariance-Covariance Regularization (VICReg) outperforms other contrastive approaches achieving 78% accuracy. Using a pre-trained VAE and VICReg for the downstream task of writer verification we observed a relative improvement in accuracy of 6.7% and 9% over ResNet-18 supervised baseline with 10% writer labels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18320v2-abstract-full').style.display = 'none'; document.getElementById('2405.18320v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures, 2 tables, Accepted at Irish Machine Vision and Image Processing Conference 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17837">arXiv:2405.17837</a> <span> [<a href="https://arxiv.org/pdf/2405.17837">pdf</a>, <a href="https://arxiv.org/format/2405.17837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Enabling Generative Design Tools with LLM Agents for Mechanical Computation Devices: A Case Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Qiuyu Lu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jiawei Fang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zhihao Yao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yue Yang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shiqing Lyu</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+H">Haipeng Mi</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+L">Lining Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17837v3-abstract-short" style="display: inline;"> In the field of Human-Computer Interaction (HCI), interactive devices with embedded mechanical computation are gaining attention. The rise of these cutting-edge devices has created a need for specialized design tools that democratize the prototyping process. While current tools streamline prototyping through parametric design and simulation, they often come with a steep learning curve and may not… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17837v3-abstract-full').style.display = 'inline'; document.getElementById('2405.17837v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17837v3-abstract-full" style="display: none;"> In the field of Human-Computer Interaction (HCI), interactive devices with embedded mechanical computation are gaining attention. The rise of these cutting-edge devices has created a need for specialized design tools that democratize the prototyping process. While current tools streamline prototyping through parametric design and simulation, they often come with a steep learning curve and may not fully support creative ideation. In this study, we use fluidic computation interfaces as a case study to explore how design tools for such devices can be augmented by Large Language Model agents (LLMs). Integrated with LLMs, the Generative Design Tool (GDT) better understands the capabilities and limitations of new technologies, proposes diverse and practical applications, and suggests designs that are technically and contextually appropriate. Additionally, it generates design parameters for visualizing results and producing fabrication-ready support files. This paper details the GDT's framework, implementation, and performance while addressing its potential and challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17837v3-abstract-full').style.display = 'none'; document.getElementById('2405.17837v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">38 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11326">arXiv:2405.11326</a> <span> [<a href="https://arxiv.org/pdf/2405.11326">pdf</a>, <a href="https://arxiv.org/format/2405.11326">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> On the Trajectory Regularity of ODE-based Diffusion Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Defang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhenyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Can Wang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.11326v1-abstract-short" style="display: inline;"> Diffusion-based generative models use stochastic differential equations (SDEs) and their equivalent ordinary differential equations (ODEs) to establish a smooth connection between a complex data distribution and a tractable prior distribution. In this paper, we identify several intriguing trajectory properties in the ODE-based sampling process of diffusion models. We characterize an implicit denoi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11326v1-abstract-full').style.display = 'inline'; document.getElementById('2405.11326v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.11326v1-abstract-full" style="display: none;"> Diffusion-based generative models use stochastic differential equations (SDEs) and their equivalent ordinary differential equations (ODEs) to establish a smooth connection between a complex data distribution and a tractable prior distribution. In this paper, we identify several intriguing trajectory properties in the ODE-based sampling process of diffusion models. We characterize an implicit denoising trajectory and discuss its vital role in forming the coupled sampling trajectory with a strong shape regularity, regardless of the generated content. We also describe a dynamic programming-based scheme to make the time schedule in sampling better fit the underlying trajectory structure. This simple strategy requires minimal modification to any given ODE-based numerical solvers and incurs negligible computational cost, while delivering superior performance in image generation, especially in $5\sim 10$ function evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11326v1-abstract-full').style.display = 'none'; document.getElementById('2405.11326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024, 30 pages. arXiv admin note: text overlap with arXiv:2305.19947</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.08487">arXiv:2405.08487</a> <span> [<a href="https://arxiv.org/pdf/2405.08487">pdf</a>, <a href="https://arxiv.org/format/2405.08487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Semantic Contextualization of Face Forgery: A New Definition, Dataset, and Detection Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+M">Mian Zou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Baosheng Yu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yibing Zhan</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kede Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.08487v1-abstract-short" style="display: inline;"> In recent years, deep learning has greatly streamlined the process of generating realistic fake face images. Aware of the dangers, researchers have developed various tools to spot these counterfeits. Yet none asked the fundamental question: What digital manipulations make a real photographic face image fake, while others do not? In this paper, we put face forgery in a semantic context and define t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.08487v1-abstract-full').style.display = 'inline'; document.getElementById('2405.08487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.08487v1-abstract-full" style="display: none;"> In recent years, deep learning has greatly streamlined the process of generating realistic fake face images. Aware of the dangers, researchers have developed various tools to spot these counterfeits. Yet none asked the fundamental question: What digital manipulations make a real photographic face image fake, while others do not? In this paper, we put face forgery in a semantic context and define that computational methods that alter semantic face attributes to exceed human discrimination thresholds are sources of face forgery. Guided by our new definition, we construct a large face forgery image dataset, where each image is associated with a set of labels organized in a hierarchical graph. Our dataset enables two new testing protocols to probe the generalization of face forgery detectors. Moreover, we propose a semantics-oriented face forgery detection method that captures label relations and prioritizes the primary task (\ie, real or fake face detection). We show that the proposed dataset successfully exposes the weaknesses of current detectors as the test set and consistently improves their generalizability as the training set. Additionally, we demonstrate the superiority of our semantics-oriented method over traditional binary and multi-class classification-based detectors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.08487v1-abstract-full').style.display = 'none'; document.getElementById('2405.08487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04051">arXiv:2405.04051</a> <span> [<a href="https://arxiv.org/pdf/2405.04051">pdf</a>, <a href="https://arxiv.org/ps/2405.04051">ps</a>, <a href="https://arxiv.org/format/2405.04051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> On the quantization goodness of polar lattices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ling Liu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shanxiang Lyu</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+C">Cong Ling</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+B">Baoming Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04051v2-abstract-short" style="display: inline;"> In this work, we prove that polar lattices, when tailored for lossy compression, are quantization-good in the sense that their normalized second moments approach $\frac{1}{2蟺e}$ as the dimension of lattices increases. It has been predicted by Zamir et al. \cite{ZamirQZ96} that the Entropy Coded Dithered Quantization (ECDQ) system using quantization-good lattices can achieve the rate-distortion bou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04051v2-abstract-full').style.display = 'inline'; document.getElementById('2405.04051v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04051v2-abstract-full" style="display: none;"> In this work, we prove that polar lattices, when tailored for lossy compression, are quantization-good in the sense that their normalized second moments approach $\frac{1}{2蟺e}$ as the dimension of lattices increases. It has been predicted by Zamir et al. \cite{ZamirQZ96} that the Entropy Coded Dithered Quantization (ECDQ) system using quantization-good lattices can achieve the rate-distortion bound of i.i.d. Gaussian sources. In our previous work \cite{LingQZ}, we established that polar lattices are indeed capable of attaining the same objective. It is reasonable to conjecture that polar lattices also demonstrate quantization goodness in the context of lossy compression. This study confirms this hypothesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04051v2-abstract-full').style.display = 'none'; document.getElementById('2405.04051v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures, submitted to IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00135">arXiv:2405.00135</a> <span> [<a href="https://arxiv.org/pdf/2405.00135">pdf</a>, <a href="https://arxiv.org/format/2405.00135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Improving Channel Resilience for Task-Oriented Semantic Communications: A Unified Information Bottleneck Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shuai Lyu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yao Sun</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Linke Guo</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiaoyong Yuan</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+F">Fang Fang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xianbin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00135v1-abstract-short" style="display: inline;"> Task-oriented semantic communications (TSC) enhance radio resource efficiency by transmitting task-relevant semantic information. However, current research often overlooks the inherent semantic distinctions among encoded features. Due to unavoidable channel variations from time and frequency-selective fading, semantically sensitive feature units could be more susceptible to erroneous inference if… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00135v1-abstract-full').style.display = 'inline'; document.getElementById('2405.00135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00135v1-abstract-full" style="display: none;"> Task-oriented semantic communications (TSC) enhance radio resource efficiency by transmitting task-relevant semantic information. However, current research often overlooks the inherent semantic distinctions among encoded features. Due to unavoidable channel variations from time and frequency-selective fading, semantically sensitive feature units could be more susceptible to erroneous inference if corrupted by dynamic channels. Therefore, this letter introduces a unified channel-resilient TSC framework via information bottleneck. This framework complements existing TSC approaches by controlling information flow to capture fine-grained feature-level semantic robustness. Experiments on a case study for real-time subchannel allocation validate the framework's effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00135v1-abstract-full').style.display = 'none'; document.getElementById('2405.00135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE Communications Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19171">arXiv:2404.19171</a> <span> [<a href="https://arxiv.org/pdf/2404.19171">pdf</a>, <a href="https://arxiv.org/format/2404.19171">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Explicit Correlation Learning for Generalizable Cross-Modal Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+C">Cai Yu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shan Jia</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xiaomeng Fu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jin Liu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jiahe Tian</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jiao Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jizhong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19171v1-abstract-short" style="display: inline;"> With the rising prevalence of deepfakes, there is a growing interest in developing generalizable detection methods for various types of deepfakes. While effective in their specific modalities, traditional detection methods fall short in addressing the generalizability of detection across diverse cross-modal deepfakes. This paper aims to explicitly learn potential cross-modal correlation to enhance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19171v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19171v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19171v1-abstract-full" style="display: none;"> With the rising prevalence of deepfakes, there is a growing interest in developing generalizable detection methods for various types of deepfakes. While effective in their specific modalities, traditional detection methods fall short in addressing the generalizability of detection across diverse cross-modal deepfakes. This paper aims to explicitly learn potential cross-modal correlation to enhance deepfake detection towards various generation scenarios. Our approach introduces a correlation distillation task, which models the inherent cross-modal correlation based on content information. This strategy helps to prevent the model from overfitting merely to audio-visual synchronization. Additionally, we present the Cross-Modal Deepfake Dataset (CMDFD), a comprehensive dataset with four generation methods to evaluate the detection of diverse cross-modal deepfakes. The experimental results on CMDFD and FakeAVCeleb datasets demonstrate the superior generalizability of our method over existing state-of-the-art methods. Our code and data can be found at \url{https://github.com/ljj898/CMDFD-Dataset-and-Deepfake-Detection}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19171v1-abstract-full').style.display = 'none'; document.getElementById('2404.19171v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICME 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.18033">arXiv:2404.18033</a> <span> [<a href="https://arxiv.org/pdf/2404.18033">pdf</a>, <a href="https://arxiv.org/format/2404.18033">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exposing Text-Image Inconsistency Using Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+M">Mingzhen Huang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shan Jia</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhou Zhou</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+Y">Yan Ju</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jialing Cai</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.18033v1-abstract-short" style="display: inline;"> In the battle against widespread online misinformation, a growing problem is text-image inconsistency, where images are misleadingly paired with texts with different intent or meaning. Existing classification-based methods for text-image inconsistency can identify contextual inconsistencies but fail to provide explainable justifications for their decisions that humans can understand. Although more… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18033v1-abstract-full').style.display = 'inline'; document.getElementById('2404.18033v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.18033v1-abstract-full" style="display: none;"> In the battle against widespread online misinformation, a growing problem is text-image inconsistency, where images are misleadingly paired with texts with different intent or meaning. Existing classification-based methods for text-image inconsistency can identify contextual inconsistencies but fail to provide explainable justifications for their decisions that humans can understand. Although more nuanced, human evaluation is impractical at scale and susceptible to errors. To address these limitations, this study introduces D-TIIL (Diffusion-based Text-Image Inconsistency Localization), which employs text-to-image diffusion models to localize semantic inconsistencies in text and image pairs. These models, trained on large-scale datasets act as ``omniscient" agents that filter out irrelevant information and incorporate background knowledge to identify inconsistencies. In addition, D-TIIL uses text embeddings and modified image regions to visualize these inconsistencies. To evaluate D-TIIL's efficacy, we introduce a new TIIL dataset containing 14K consistent and inconsistent text-image pairs. Unlike existing datasets, TIIL enables assessment at the level of individual words and image regions and is carefully designed to represent various inconsistencies. D-TIIL offers a scalable and evidence-based approach to identifying and localizing text-image inconsistency, providing a robust framework for future research combating misinformation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18033v1-abstract-full').style.display = 'none'; document.getElementById('2404.18033v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13146">arXiv:2404.13146</a> <span> [<a href="https://arxiv.org/pdf/2404.13146">pdf</a>, <a href="https://arxiv.org/format/2404.13146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DeepFake-O-Meter v2.0: An Open Platform for DeepFake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ju%2C+Y">Yan Ju</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chengzhe Sun</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shan Jia</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+S">Shuwei Hou</a>, <a href="/search/cs?searchtype=author&query=Si%2C+Z">Zhaofeng Si</a>, <a href="/search/cs?searchtype=author&query=Datta%2C+S+K">Soumyya Kanti Datta</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+L">Lipeng Ke</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+R">Riky Zhou</a>, <a href="/search/cs?searchtype=author&query=Nikolich%2C+A">Anita Nikolich</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13146v2-abstract-short" style="display: inline;"> Deepfakes, as AI-generated media, have increasingly threatened media integrity and personal privacy with realistic yet fake digital content. In this work, we introduce an open-source and user-friendly online platform, DeepFake-O-Meter v2.0, that integrates state-of-the-art methods for detecting Deepfake images, videos, and audio. Built upon DeepFake-O-Meter v1.0, we have made significant upgrades… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13146v2-abstract-full').style.display = 'inline'; document.getElementById('2404.13146v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13146v2-abstract-full" style="display: none;"> Deepfakes, as AI-generated media, have increasingly threatened media integrity and personal privacy with realistic yet fake digital content. In this work, we introduce an open-source and user-friendly online platform, DeepFake-O-Meter v2.0, that integrates state-of-the-art methods for detecting Deepfake images, videos, and audio. Built upon DeepFake-O-Meter v1.0, we have made significant upgrades and improvements in platform architecture design, including user interaction, detector integration, job balancing, and security management. The platform aims to offer everyday users a convenient service for analyzing DeepFake media using multiple state-of-the-art detection algorithms. It ensures secure and private delivery of the analysis results. Furthermore, it serves as an evaluation and benchmarking platform for researchers in digital media forensics to compare the performance of multiple algorithms on the same input. We have also conducted detailed usage analysis based on the collected data to gain deeper insights into our platform's statistics. This involves analyzing two-month trends in user activity and evaluating the processing efficiency of each detector. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13146v2-abstract-full').style.display = 'none'; document.getElementById('2404.13146v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14077">arXiv:2403.14077</a> <span> [<a href="https://arxiv.org/pdf/2403.14077">pdf</a>, <a href="https://arxiv.org/format/2403.14077">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Can ChatGPT Detect DeepFakes? A Study of Using Multimodal Large Language Models for Media Forensics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shan Jia</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+R">Reilin Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kangran Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yize Chen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhiyuan Yan</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+Y">Yan Ju</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+C">Chuanbo Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14077v4-abstract-short" style="display: inline;"> DeepFakes, which refer to AI-generated media content, have become an increasing concern due to their use as a means for disinformation. Detecting DeepFakes is currently solved with programmed machine learning algorithms. In this work, we investigate the capabilities of multimodal large language models (LLMs) in DeepFake detection. We conducted qualitative and quantitative experiments to demonstrat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14077v4-abstract-full').style.display = 'inline'; document.getElementById('2403.14077v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.14077v4-abstract-full" style="display: none;"> DeepFakes, which refer to AI-generated media content, have become an increasing concern due to their use as a means for disinformation. Detecting DeepFakes is currently solved with programmed machine learning algorithms. In this work, we investigate the capabilities of multimodal large language models (LLMs) in DeepFake detection. We conducted qualitative and quantitative experiments to demonstrate multimodal LLMs and show that they can expose AI-generated images through careful experimental design and prompt engineering. This is interesting, considering that LLMs are not inherently tailored for media forensic tasks, and the process does not require programming. We discuss the limitations of multimodal LLMs for these tasks and suggest possible improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14077v4-abstract-full').style.display = 'none'; document.getElementById('2403.14077v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13358">arXiv:2403.13358</a> <span> [<a href="https://arxiv.org/pdf/2403.13358">pdf</a>, <a href="https://arxiv.org/format/2403.13358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped Robot </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+W">Wenxuan Song</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Han Zhao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+P">Pengxiang Ding</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+C">Can Cui</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shangke Lyu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yaning Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Donglin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13358v2-abstract-short" style="display: inline;"> Multi-task robot learning holds significant importance in tackling diverse and complex scenarios. However, current approaches are hindered by performance issues and difficulties in collecting training datasets. In this paper, we propose GeRM (Generalist Robotic Model). We utilize offline reinforcement learning to optimize data utilization strategies to learn from both demonstrations and sub-optima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13358v2-abstract-full').style.display = 'inline'; document.getElementById('2403.13358v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13358v2-abstract-full" style="display: none;"> Multi-task robot learning holds significant importance in tackling diverse and complex scenarios. However, current approaches are hindered by performance issues and difficulties in collecting training datasets. In this paper, we propose GeRM (Generalist Robotic Model). We utilize offline reinforcement learning to optimize data utilization strategies to learn from both demonstrations and sub-optimal data, thus surpassing the limitations of human demonstrations. Thereafter, we employ a transformer-based VLA network to process multi-modal inputs and output actions. By introducing the Mixture-of-Experts structure, GeRM allows faster inference speed with higher whole model capacity, and thus resolves the issue of limited RL parameters, enhancing model performance in multi-task learning while controlling computational costs. Through a series of experiments, we demonstrate that GeRM outperforms other methods across all tasks, while also validating its efficiency in both training and inference processes. Additionally, we uncover its potential to acquire emergent skills. Additionally, we contribute the QUARD-Auto dataset, collected automatically to support our training approach and foster advancements in multi-task quadruped robot learning. This work presents a new paradigm for reducing the cost of collecting robot data and driving progress in the multi-task learning community. You can reach our project and video through the link: https://songwxuan.github.io/GeRM/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13358v2-abstract-full').style.display = 'none'; document.getElementById('2403.13358v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12631">arXiv:2403.12631</a> <span> [<a href="https://arxiv.org/pdf/2403.12631">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PointGrasp: Point Cloud-based Grasping for Tendon-driven Soft Robotic Glove Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+C">Chen Hu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shirui Lyu</a>, <a href="/search/cs?searchtype=author&query=Rho%2C+E">Eojin Rho</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Daekyum Kim</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shan Luo</a>, <a href="/search/cs?searchtype=author&query=Gionfrida%2C+L">Letizia Gionfrida</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12631v1-abstract-short" style="display: inline;"> Controlling hand exoskeletons to assist individuals with grasping tasks poses a challenge due to the difficulty in understanding user intentions. We propose that most daily grasping tasks during activities of daily living (ADL) can be deduced by analyzing object geometries (simple and complex) from 3D point clouds. The study introduces PointGrasp, a real-time system designed for identifying househ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12631v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12631v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12631v1-abstract-full" style="display: none;"> Controlling hand exoskeletons to assist individuals with grasping tasks poses a challenge due to the difficulty in understanding user intentions. We propose that most daily grasping tasks during activities of daily living (ADL) can be deduced by analyzing object geometries (simple and complex) from 3D point clouds. The study introduces PointGrasp, a real-time system designed for identifying household scenes semantically, aiming to support and enhance assistance during ADL for tailored end-to-end grasping tasks. The system comprises an RGB-D camera with an inertial measurement unit and a microprocessor integrated into a tendon-driven soft robotic glove. The RGB-D camera processes 3D scenes at a rate exceeding 30 frames per second. The proposed pipeline demonstrates an average RMSE of 0.8 $\pm$ 0.39 cm for simple and 0.11 $\pm$ 0.06 cm for complex geometries. Within each mode, it identifies and pinpoints reachable objects. This system shows promise in end-to-end vision-driven robotic-assisted rehabilitation manual tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12631v1-abstract-full').style.display = 'none'; document.getElementById('2403.12631v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 8 figures, conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; I.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03101">arXiv:2403.03101</a> <span> [<a href="https://arxiv.org/pdf/2403.03101">pdf</a>, <a href="https://arxiv.org/format/2403.03101">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> KnowAgent: Knowledge-Augmented Planning for LLM-Based Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+S">Shuofei Qiao</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+Y">Yixin Ou</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+S">Shumin Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ningyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shiwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yue Shen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Lei Liang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jinjie Gu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huajun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03101v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated great potential in complex reasoning tasks, yet they fall short when tackling more sophisticated challenges, especially when interacting with environments through generating executable actions. This inadequacy primarily stems from the lack of built-in action knowledge in language agents, which fails to effectively guide the planning trajectories durin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03101v1-abstract-full').style.display = 'inline'; document.getElementById('2403.03101v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03101v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated great potential in complex reasoning tasks, yet they fall short when tackling more sophisticated challenges, especially when interacting with environments through generating executable actions. This inadequacy primarily stems from the lack of built-in action knowledge in language agents, which fails to effectively guide the planning trajectories during task solving and results in planning hallucination. To address this issue, we introduce KnowAgent, a novel approach designed to enhance the planning capabilities of LLMs by incorporating explicit action knowledge. Specifically, KnowAgent employs an action knowledge base and a knowledgeable self-learning strategy to constrain the action path during planning, enabling more reasonable trajectory synthesis, and thereby enhancing the planning performance of language agents. Experimental results on HotpotQA and ALFWorld based on various backbone models demonstrate that KnowAgent can achieve comparable or superior performance to existing baselines. Further analysis indicates the effectiveness of KnowAgent in terms of planning hallucinations mitigation. Code is available in https://github.com/zjunlp/KnowAgent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03101v1-abstract-full').style.display = 'none'; document.getElementById('2403.03101v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress. Project page: https://zjunlp.github.io/project/KnowAgent/ Code: https://github.com/zjunlp/KnowAgent</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01154">arXiv:2402.01154</a> <span> [<a href="https://arxiv.org/pdf/2402.01154">pdf</a>, <a href="https://arxiv.org/format/2402.01154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Towards Quantum-Safe Federated Learning via Homomorphic Encryption: Learning with Gradients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+G">Guangfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shanxiang Lyu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+H">Hanxu Hou</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhiyong Zheng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Linqi Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01154v1-abstract-short" style="display: inline;"> This paper introduces a privacy-preserving distributed learning framework via private-key homomorphic encryption. Thanks to the randomness of the quantization of gradients, our learning with error (LWE) based encryption can eliminate the error terms, thus avoiding the issue of error expansion in conventional LWE-based homomorphic encryption. The proposed system allows a large number of learning pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01154v1-abstract-full').style.display = 'inline'; document.getElementById('2402.01154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.01154v1-abstract-full" style="display: none;"> This paper introduces a privacy-preserving distributed learning framework via private-key homomorphic encryption. Thanks to the randomness of the quantization of gradients, our learning with error (LWE) based encryption can eliminate the error terms, thus avoiding the issue of error expansion in conventional LWE-based homomorphic encryption. The proposed system allows a large number of learning participants to engage in neural network-based deep learning collaboratively over an honest-but-curious server, while ensuring the cryptographic security of participants' uploaded gradients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01154v1-abstract-full').style.display = 'none'; document.getElementById('2402.01154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.10113">arXiv:2401.10113</a> <span> [<a href="https://arxiv.org/pdf/2401.10113">pdf</a>, <a href="https://arxiv.org/ps/2401.10113">ps</a>, <a href="https://arxiv.org/format/2401.10113">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exposing Lip-syncing Deepfakes from Mouth Inconsistencies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Datta%2C+S+K">Soumyya Kanti Datta</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Shan Jia</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.10113v2-abstract-short" style="display: inline;"> A lip-syncing deepfake is a digitally manipulated video in which a person's lip movements are created convincingly using AI models to match altered or entirely new audio. Lip-syncing deepfakes are a dangerous type of deepfakes as the artifacts are limited to the lip region and more difficult to discern. In this paper, we describe a novel approach, LIP-syncing detection based on mouth INConsistency… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10113v2-abstract-full').style.display = 'inline'; document.getElementById('2401.10113v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.10113v2-abstract-full" style="display: none;"> A lip-syncing deepfake is a digitally manipulated video in which a person's lip movements are created convincingly using AI models to match altered or entirely new audio. Lip-syncing deepfakes are a dangerous type of deepfakes as the artifacts are limited to the lip region and more difficult to discern. In this paper, we describe a novel approach, LIP-syncing detection based on mouth INConsistency (LIPINC), for lip-syncing deepfake detection by identifying temporal inconsistencies in the mouth region. These inconsistencies are seen in the adjacent frames and throughout the video. Our model can successfully capture these irregularities and outperforms the state-of-the-art methods on several benchmark deepfake datasets. Code is available at https://github.com/skrantidatta/LIPINC <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10113v2-abstract-full').style.display = 'none'; document.getElementById('2401.10113v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.17431">arXiv:2312.17431</a> <span> [<a href="https://arxiv.org/pdf/2312.17431">pdf</a>, <a href="https://arxiv.org/format/2312.17431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MVPatch: More Vivid Patch for Adversarial Camouflaged Attacks on Object Detectors in the Physical World </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hongbo Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Ju Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiaosheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+L">Liwei Geng</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shuchang Lyu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wenquan Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.17431v3-abstract-short" style="display: inline;"> Recent studies have shown that Adversarial Patches (APs) can effectively manipulate object detection models. However, the conspicuous patterns often associated with these patches tend to attract human attention, posing a significant challenge. Existing research has primarily focused on enhancing attack efficacy in the physical domain while often neglecting the optimization of stealthiness and tran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17431v3-abstract-full').style.display = 'inline'; document.getElementById('2312.17431v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.17431v3-abstract-full" style="display: none;"> Recent studies have shown that Adversarial Patches (APs) can effectively manipulate object detection models. However, the conspicuous patterns often associated with these patches tend to attract human attention, posing a significant challenge. Existing research has primarily focused on enhancing attack efficacy in the physical domain while often neglecting the optimization of stealthiness and transferability. Furthermore, applying APs in real-world scenarios faces major challenges related to transferability, stealthiness, and practicality. To address these challenges, we introduce generalization theory into the context of APs, enabling our iterative process to simultaneously enhance transferability and refine visual correlation with realistic images. We propose a Dual-Perception-Based Framework (DPBF) to generate the More Vivid Patch (MVPatch), which enhances transferability, stealthiness, and practicality. The DPBF integrates two key components: the Model-Perception-Based Module (MPBM) and the Human-Perception-Based Module (HPBM), along with regularization terms. The MPBM employs ensemble strategy to reduce object confidence scores across multiple detectors, thereby improving AP transferability with robust theoretical support. Concurrently, the HPBM introduces a lightweight method for achieving visual similarity, creating natural and inconspicuous adversarial patches without relying on additional generative models. The regularization terms further enhance the practicality of the generated APs in the physical domain. Additionally, we introduce naturalness and transferability scores to provide an unbiased assessment of APs. Extensive experimental validation demonstrates that MVPatch achieves superior transferability and a natural appearance in both digital and physical domains, underscoring its effectiveness and stealthiness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17431v3-abstract-full').style.display = 'none'; document.getElementById('2312.17431v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 8 figures. This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.09785">arXiv:2312.09785</a> <span> [<a href="https://arxiv.org/pdf/2312.09785">pdf</a>, <a href="https://arxiv.org/format/2312.09785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RJUA-QA: A Comprehensive QA Dataset for Urology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shiwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+C">Chenfei Chi</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Hongbo Cai</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Lei Shi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoyan Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Deng Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+X">Xianguo Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Ming Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Fangzhou Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaowei Ma</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yue Shen</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jinjie Gu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+W">Wei Xue</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiran Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.09785v3-abstract-short" style="display: inline;"> We introduce RJUA-QA, a novel medical dataset for question answering (QA) and reasoning with clinical evidence, contributing to bridge the gap between general large language models (LLMs) and medical-specific LLM applications. RJUA-QA is derived from realistic clinical scenarios and aims to facilitate LLMs in generating reliable diagnostic and advice. The dataset contains 2,132 curated Question-Co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09785v3-abstract-full').style.display = 'inline'; document.getElementById('2312.09785v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.09785v3-abstract-full" style="display: none;"> We introduce RJUA-QA, a novel medical dataset for question answering (QA) and reasoning with clinical evidence, contributing to bridge the gap between general large language models (LLMs) and medical-specific LLM applications. RJUA-QA is derived from realistic clinical scenarios and aims to facilitate LLMs in generating reliable diagnostic and advice. The dataset contains 2,132 curated Question-Context-Answer pairs, corresponding about 25,000 diagnostic records and clinical cases. The dataset covers 67 common urological disease categories, where the disease coverage exceeds 97.6\% of the population seeking medical services in urology. Each data instance in RJUA-QA comprises: (1) a question mirroring real patient to inquiry about clinical symptoms and medical conditions, (2) a context including comprehensive expert knowledge, serving as a reference for medical examination and diagnosis, (3) a doctor response offering the diagnostic conclusion and suggested examination guidance, (4) a diagnosed clinical disease as the recommended diagnostic outcome, and (5) clinical advice providing recommendations for medical examination. RJUA-QA is the first medical QA dataset for clinical reasoning over the patient inquiries, where expert-level knowledge and experience are required for yielding diagnostic conclusions and medical examination advice. A comprehensive evaluation is conducted to evaluate the performance of both medical-specific and general LLMs on the RJUA-QA dataset. Our data is are publicly available at \url{https://github.com/alipay/RJU_Ant_QA}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09785v3-abstract-full').style.display = 'none'; document.getElementById('2312.09785v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An initial version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.09639">arXiv:2312.09639</a> <span> [<a href="https://arxiv.org/pdf/2312.09639">pdf</a>, <a href="https://arxiv.org/format/2312.09639">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3511808.3557655">10.1145/3511808.3557655 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multiple Instance Learning for Uplift Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yao Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haipeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shiwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Ruiying Jiang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jinjie Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guannan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.09639v1-abstract-short" style="display: inline;"> Uplift modeling is widely used in performance marketing to estimate effects of promotion campaigns (e.g., increase of customer retention rate). Since it is impossible to observe outcomes of a recipient in treatment (e.g., receiving a certain promotion) and control (e.g., without promotion) groups simultaneously (i.e., counter-factual), uplift models are mainly trained on instances of treatment and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09639v1-abstract-full').style.display = 'inline'; document.getElementById('2312.09639v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.09639v1-abstract-full" style="display: none;"> Uplift modeling is widely used in performance marketing to estimate effects of promotion campaigns (e.g., increase of customer retention rate). Since it is impossible to observe outcomes of a recipient in treatment (e.g., receiving a certain promotion) and control (e.g., without promotion) groups simultaneously (i.e., counter-factual), uplift models are mainly trained on instances of treatment and control groups separately to form two models respectively, and uplifts are predicted by the difference of predictions from these two models (i.e., two-model method). When responses are noisy and the treatment effect is fractional, induced individual uplift predictions will be inaccurate, resulting in targeting undesirable customers. Though it is impossible to obtain the ideal ground-truth individual uplifts, known as Individual Treatment Effects (ITEs), alternatively, an average uplift of a group of users, called Average Treatment Effect (ATE), can be observed from experimental deliveries. Upon this, similar to Multiple Instance Learning (MIL) in which each training sample is a bag of instances, our framework sums up individual user uplift predictions for each bag of users as its bag-wise ATE prediction, and regularizes it to its ATE label, thus learning more accurate individual uplifts. Additionally, to amplify the fractional treatment effect, bags are composed of instances with adjacent individual uplift predictions, instead of random instances. Experiments conducted on two datasets show the effectiveness and universality of the proposed framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09639v1-abstract-full').style.display = 'none'; document.getElementById('2312.09639v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">short paper of CIKM22(full version)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 31st ACM International Conference on Information and Knowledge Management (2022) 4727-4731 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.05738">arXiv:2312.05738</a> <span> [<a href="https://arxiv.org/pdf/2312.05738">pdf</a>, <a href="https://arxiv.org/format/2312.05738">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FedReverse: Multiparty Reversible Deep Neural Network Watermarking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+J">Junlong Mao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huiyi Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fengxia Liu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhiyong Zheng</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shanxiang Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.05738v1-abstract-short" style="display: inline;"> The proliferation of Deep Neural Networks (DNN) in commercial applications is expanding rapidly. Simultaneously, the increasing complexity and cost of training DNN models have intensified the urgency surrounding the protection of intellectual property associated with these trained models. In this regard, DNN watermarking has emerged as a crucial safeguarding technique. This paper presents FedRever… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05738v1-abstract-full').style.display = 'inline'; document.getElementById('2312.05738v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.05738v1-abstract-full" style="display: none;"> The proliferation of Deep Neural Networks (DNN) in commercial applications is expanding rapidly. Simultaneously, the increasing complexity and cost of training DNN models have intensified the urgency surrounding the protection of intellectual property associated with these trained models. In this regard, DNN watermarking has emerged as a crucial safeguarding technique. This paper presents FedReverse, a novel multiparty reversible watermarking approach for robust copyright protection while minimizing performance impact. Unlike existing methods, FedReverse enables collaborative watermark embedding from multiple parties after model training, ensuring individual copyright claims. In addition, FedReverse is reversible, enabling complete watermark removal with unanimous client consent. FedReverse demonstrates perfect covering, ensuring that observations of watermarked content do not reveal any information about the hidden watermark. Additionally, it showcases resistance against Known Original Attacks (KOA), making it highly challenging for attackers to forge watermarks or infer the key. This paper further evaluates FedReverse through comprehensive simulations involving Multi-layer Perceptron (MLP) and Convolutional Neural Networks (CNN) trained on the MNIST dataset. The simulations demonstrate FedReverse's robustness, reversibility, and minimal impact on model accuracy across varying embedding parameters and multiple client scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05738v1-abstract-full').style.display = 'none'; document.getElementById('2312.05738v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.11278">arXiv:2311.11278</a> <span> [<a href="https://arxiv.org/pdf/2311.11278">pdf</a>, <a href="https://arxiv.org/format/2311.11278">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Transcending Forgery Specificity with Latent Space Augmentation for Generalizable Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhiyuan Yan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yuhao Luo</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingshan Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.11278v2-abstract-short" style="display: inline;"> Deepfake detection faces a critical generalization hurdle, with performance deteriorating when there is a mismatch between the distributions of training and testing data. A broadly received explanation is the tendency of these detectors to be overfitted to forgery-specific artifacts, rather than learning features that are widely applicable across various forgeries. To address this issue, we propos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.11278v2-abstract-full').style.display = 'inline'; document.getElementById('2311.11278v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.11278v2-abstract-full" style="display: none;"> Deepfake detection faces a critical generalization hurdle, with performance deteriorating when there is a mismatch between the distributions of training and testing data. A broadly received explanation is the tendency of these detectors to be overfitted to forgery-specific artifacts, rather than learning features that are widely applicable across various forgeries. To address this issue, we propose a simple yet effective detector called LSDA (\underline{L}atent \underline{S}pace \underline{D}ata \underline{A}ugmentation), which is based on a heuristic idea: representations with a wider variety of forgeries should be able to learn a more generalizable decision boundary, thereby mitigating the overfitting of method-specific features (see Fig.~\ref{fig:toy}). Following this idea, we propose to enlarge the forgery space by constructing and simulating variations within and across forgery features in the latent space. This approach encompasses the acquisition of enriched, domain-specific features and the facilitation of smoother transitions between different forgery types, effectively bridging domain gaps. Our approach culminates in refining a binary classifier that leverages the distilled knowledge from the enhanced features, striving for a generalizable deepfake detector. Comprehensive experiments show that our proposed method is surprisingly effective and transcends state-of-the-art detectors across several widely used benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.11278v2-abstract-full').style.display = 'none'; document.getElementById('2311.11278v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.06015">arXiv:2311.06015</a> <span> [<a href="https://arxiv.org/pdf/2311.06015">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RSG: Fast Learning Adaptive Skills for Quadruped Robots by Skill Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongyin Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+D">Diyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Z">Zifeng Zhuang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Han Zhao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhenyu Wei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+F">Feng Zhao</a>, <a href="/search/cs?searchtype=author&query=Gai%2C+S">Sibo Gai</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shangke Lyu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Donglin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.06015v1-abstract-short" style="display: inline;"> Developing robotic intelligent systems that can adapt quickly to unseen wild situations is one of the critical challenges in pursuing autonomous robotics. Although some impressive progress has been made in walking stability and skill learning in the field of legged robots, their ability to fast adaptation is still inferior to that of animals in nature. Animals are born with massive skills needed t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.06015v1-abstract-full').style.display = 'inline'; document.getElementById('2311.06015v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.06015v1-abstract-full" style="display: none;"> Developing robotic intelligent systems that can adapt quickly to unseen wild situations is one of the critical challenges in pursuing autonomous robotics. Although some impressive progress has been made in walking stability and skill learning in the field of legged robots, their ability to fast adaptation is still inferior to that of animals in nature. Animals are born with massive skills needed to survive, and can quickly acquire new ones, by composing fundamental skills with limited experience. Inspired by this, we propose a novel framework, named Robot Skill Graph (RSG) for organizing massive fundamental skills of robots and dexterously reusing them for fast adaptation. Bearing a structure similar to the Knowledge Graph (KG), RSG is composed of massive dynamic behavioral skills instead of static knowledge in KG and enables discovering implicit relations that exist in be-tween of learning context and acquired skills of robots, serving as a starting point for understanding subtle patterns existing in robots' skill learning. Extensive experimental results demonstrate that RSG can provide rational skill inference upon new tasks and environments and enable quadruped robots to adapt to new scenarios and learn new skills rapidly. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.06015v1-abstract-full').style.display = 'none'; document.getElementById('2311.06015v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.05836">arXiv:2311.05836</a> <span> [<a href="https://arxiv.org/pdf/2311.05836">pdf</a>, <a href="https://arxiv.org/format/2311.05836">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UMedNeRF: Uncertainty-aware Single View Volumetric Rendering for Medical Neural Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jing Hu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Q">Qinrui Fan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xi Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.05836v7-abstract-short" style="display: inline;"> In the field of clinical medicine, computed tomography (CT) is an effective medical imaging modality for the diagnosis of various pathologies. Compared with X-ray images, CT images can provide more information, including multi-planar slices and three-dimensional structures for clinical diagnosis. However, CT imaging requires patients to be exposed to large doses of ionizing radiation for a long ti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.05836v7-abstract-full').style.display = 'inline'; document.getElementById('2311.05836v7-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.05836v7-abstract-full" style="display: none;"> In the field of clinical medicine, computed tomography (CT) is an effective medical imaging modality for the diagnosis of various pathologies. Compared with X-ray images, CT images can provide more information, including multi-planar slices and three-dimensional structures for clinical diagnosis. However, CT imaging requires patients to be exposed to large doses of ionizing radiation for a long time, which may cause irreversible physical harm. In this paper, we propose an Uncertainty-aware MedNeRF (UMedNeRF) network based on generated radiation fields. The network can learn a continuous representation of CT projections from 2D X-ray images by obtaining the internal structure and depth information and using adaptive loss weights to ensure the quality of the generated images. Our model is trained on publicly available knee and chest datasets, and we show the results of CT projection rendering with a single X-ray and compare our method with other methods based on generated radiation fields. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.05836v7-abstract-full').style.display = 'none'; document.getElementById('2311.05836v7-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.02926">arXiv:2311.02926</a> <span> [<a href="https://arxiv.org/pdf/2311.02926">pdf</a>, <a href="https://arxiv.org/format/2311.02926">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Image Semantic Communication Model for Artificial Intelligent Internet of Things </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qian%2C+L+P">Li Ping Qian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Sikai Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Huijie Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuan Wu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X+S">Xuemin Sherman Shen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoniu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.02926v2-abstract-short" style="display: inline;"> With the rapid development of Artificial Intelligent Internet of Things (AIoT), the image data from AIoT devices has been witnessing the explosive increasing. In this paper, a novel deep image semantic communication model is proposed for the efficient image communication in AIoT. Particularly, at the transmitter side, a high-precision image semantic segmentation algorithm is proposed to extract th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.02926v2-abstract-full').style.display = 'inline'; document.getElementById('2311.02926v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.02926v2-abstract-full" style="display: none;"> With the rapid development of Artificial Intelligent Internet of Things (AIoT), the image data from AIoT devices has been witnessing the explosive increasing. In this paper, a novel deep image semantic communication model is proposed for the efficient image communication in AIoT. Particularly, at the transmitter side, a high-precision image semantic segmentation algorithm is proposed to extract the semantic information of the image to achieve significant compression of the image data. At the receiver side, a semantic image restoration algorithm based on Generative Adversarial Network (GAN) is proposed to convert the semantic image to a real scene image with detailed information. Simulation results demonstrate that the proposed image semantic communication model can improve the image compression ratio and recovery accuracy by 71.93% and 25.07% on average in comparison with WebP and CycleGAN, respectively. More importantly, our demo experiment shows that the proposed model reduces the total delay by 95.26% in the image communication, when comparing with the original image transmission. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.02926v2-abstract-full').style.display = 'none'; document.getElementById('2311.02926v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.14374">arXiv:2310.14374</a> <span> [<a href="https://arxiv.org/pdf/2310.14374">pdf</a>, <a href="https://arxiv.org/format/2310.14374">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OV-VG: A Benchmark for Open-Vocabulary Visual Grounding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunlei Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wenquan Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtai Li</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Guangliang Cheng</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shuchang Lyu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Binghao Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lijiang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.14374v1-abstract-short" style="display: inline;"> Open-vocabulary learning has emerged as a cutting-edge research area, particularly in light of the widespread adoption of vision-based foundational models. Its primary objective is to comprehend novel concepts that are not encompassed within a predefined vocabulary. One key facet of this endeavor is Visual Grounding, which entails locating a specific region within an image based on a corresponding… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.14374v1-abstract-full').style.display = 'inline'; document.getElementById('2310.14374v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.14374v1-abstract-full" style="display: none;"> Open-vocabulary learning has emerged as a cutting-edge research area, particularly in light of the widespread adoption of vision-based foundational models. Its primary objective is to comprehend novel concepts that are not encompassed within a predefined vocabulary. One key facet of this endeavor is Visual Grounding, which entails locating a specific region within an image based on a corresponding language description. While current foundational models excel at various visual language tasks, there's a noticeable absence of models specifically tailored for open-vocabulary visual grounding. This research endeavor introduces novel and challenging OV tasks, namely Open-Vocabulary Visual Grounding and Open-Vocabulary Phrase Localization. The overarching aim is to establish connections between language descriptions and the localization of novel objects. To facilitate this, we have curated a comprehensive annotated benchmark, encompassing 7,272 OV-VG images and 1,000 OV-PL images. In our pursuit of addressing these challenges, we delved into various baseline methodologies rooted in existing open-vocabulary object detection, VG, and phrase localization frameworks. Surprisingly, we discovered that state-of-the-art methods often falter in diverse scenarios. Consequently, we developed a novel framework that integrates two critical components: Text-Image Query Selection and Language-Guided Feature Attention. These modules are designed to bolster the recognition of novel categories and enhance the alignment between visual and linguistic information. Extensive experiments demonstrate the efficacy of our proposed framework, which consistently attains SOTA performance across the OV-VG task. Additionally, ablation studies provide further evidence of the effectiveness of our innovative models. Codes and datasets will be made publicly available at https://github.com/cv516Buaa/OV-VG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.14374v1-abstract-full').style.display = 'none'; document.getElementById('2310.14374v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.07525">arXiv:2310.07525</a> <span> [<a href="https://arxiv.org/pdf/2310.07525">pdf</a>, <a href="https://arxiv.org/format/2310.07525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ViT-A*: Legged Robot Path Planning using Vision Transformer A* </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianwei Liu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Shirui Lyu</a>, <a href="/search/cs?searchtype=author&query=Hadjivelichkov%2C+D">Denis Hadjivelichkov</a>, <a href="/search/cs?searchtype=author&query=Modugno%2C+V">Valerio Modugno</a>, <a href="/search/cs?searchtype=author&query=Kanoulas%2C+D">Dimitrios Kanoulas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.07525v1-abstract-short" style="display: inline;"> Legged robots, particularly quadrupeds, offer promising navigation capabilities, especially in scenarios requiring traversal over diverse terrains and obstacle avoidance. This paper addresses the challenge of enabling legged robots to navigate complex environments effectively through the integration of data-driven path-planning methods. We propose an approach that utilizes differentiable planners,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07525v1-abstract-full').style.display = 'inline'; document.getElementById('2310.07525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.07525v1-abstract-full" style="display: none;"> Legged robots, particularly quadrupeds, offer promising navigation capabilities, especially in scenarios requiring traversal over diverse terrains and obstacle avoidance. This paper addresses the challenge of enabling legged robots to navigate complex environments effectively through the integration of data-driven path-planning methods. We propose an approach that utilizes differentiable planners, allowing the learning of end-to-end global plans via a neural network for commanding quadruped robots. The approach leverages 2D maps and obstacle specifications as inputs to generate a global path. To enhance the functionality of the developed neural network-based path planner, we use Vision Transformers (ViT) for map pre-processing, to enable the effective handling of larger maps. Experimental evaluations on two real robotic quadrupeds (Boston Dynamics Spot and Unitree Go1) demonstrate the effectiveness and versatility of the proposed approach in generating reliable path plans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07525v1-abstract-full').style.display = 'none'; document.getElementById('2310.07525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 6 figures, conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE-RAS International Conference on Humanoids Robots (Humanoids) 2023 </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lyu%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository