Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 77 results for author: <span class="mathjax">Sang, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Sang%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Sang, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Sang%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Sang, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Sang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Sang%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Sang%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16849">arXiv:2412.16849</a> <span> [<a href="https://arxiv.org/pdf/2412.16849">pdf</a>, <a href="https://arxiv.org/format/2412.16849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenRFT: Adapting Reasoning Foundation Model for Domain-specific Tasks with Reinforcement Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuqi Yang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+J">Jiangming Shu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinlin Xiao</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16849v1-abstract-short" style="display: inline;"> OpenAI's recent introduction of Reinforcement Fine-Tuning (RFT) showcases the potential of reasoning foundation model and offers a new paradigm for fine-tuning beyond simple pattern imitation. This technical report presents \emph{OpenRFT}, our attempt to fine-tune generalist reasoning models for domain-specific tasks under the same settings as RFT. OpenRFT addresses two key challenges of lacking r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16849v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16849v1-abstract-full" style="display: none;"> OpenAI's recent introduction of Reinforcement Fine-Tuning (RFT) showcases the potential of reasoning foundation model and offers a new paradigm for fine-tuning beyond simple pattern imitation. This technical report presents \emph{OpenRFT}, our attempt to fine-tune generalist reasoning models for domain-specific tasks under the same settings as RFT. OpenRFT addresses two key challenges of lacking reasoning step data and the limited quantity of training samples, by leveraging the domain-specific samples in three ways: question augmentation, synthesizing reasoning-process data, and few-shot ICL. The evaluation is conducted on SciKnowEval, where OpenRFT achieves notable performance gains with only $100$ domain-specific samples for each task. More experimental results will be updated continuously in later versions. Source codes, datasets, and models are disclosed at: https://github.com/ADaM-BJTU/OpenRFT <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16849v1-abstract-full').style.display = 'none'; document.getElementById('2412.16849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00154">arXiv:2412.00154</a> <span> [<a href="https://arxiv.org/pdf/2412.00154">pdf</a>, <a href="https://arxiv.org/format/2412.00154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> o1-Coder: an o1 Replication for Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shangxi Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuqi Yang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+J">Jiangming Shu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinlin Xiao</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chao Kong</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00154v2-abstract-short" style="display: inline;"> The technical report introduces O1-CODER, an attempt to replicate OpenAI's o1 model with a focus on coding tasks. It integrates reinforcement learning (RL) and Monte Carlo Tree Search (MCTS) to enhance the model's System-2 thinking capabilities. The framework includes training a Test Case Generator (TCG) for standardized code testing, using MCTS to generate code data with reasoning processes, and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00154v2-abstract-full').style.display = 'inline'; document.getElementById('2412.00154v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00154v2-abstract-full" style="display: none;"> The technical report introduces O1-CODER, an attempt to replicate OpenAI's o1 model with a focus on coding tasks. It integrates reinforcement learning (RL) and Monte Carlo Tree Search (MCTS) to enhance the model's System-2 thinking capabilities. The framework includes training a Test Case Generator (TCG) for standardized code testing, using MCTS to generate code data with reasoning processes, and iteratively fine-tuning the policy model to initially produce pseudocode and then generate the full code. The report also addresses the opportunities and challenges in deploying o1-like models in real-world applications, suggesting transitioning to the System-2 paradigm and highlighting the imperative for world model construction. Updated model progress and experimental results will be reported in subsequent versions. All source code, curated datasets, as well as the derived models are disclosed at https://github.com/ADaM-BJTU/O1-CODER . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00154v2-abstract-full').style.display = 'none'; document.getElementById('2412.00154v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17075">arXiv:2411.17075</a> <span> [<a href="https://arxiv.org/pdf/2411.17075">pdf</a>, <a href="https://arxiv.org/format/2411.17075">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Don't Command, Cultivate: An Exploratory Study of System-2 Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanxu Zhu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+X">Xinyan Wen</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17075v5-abstract-short" style="display: inline;"> The o1 system card identifies the o1 models as the most robust within OpenAI, with their defining characteristic being the progression from rapid, intuitive thinking to slower, more deliberate reasoning. This observation motivated us to investigate the influence of System-2 thinking patterns on model safety. In our preliminary research, we conducted safety evaluations of the o1 model, including co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17075v5-abstract-full').style.display = 'inline'; document.getElementById('2411.17075v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17075v5-abstract-full" style="display: none;"> The o1 system card identifies the o1 models as the most robust within OpenAI, with their defining characteristic being the progression from rapid, intuitive thinking to slower, more deliberate reasoning. This observation motivated us to investigate the influence of System-2 thinking patterns on model safety. In our preliminary research, we conducted safety evaluations of the o1 model, including complex jailbreak attack scenarios using adversarial natural language prompts and mathematical encoding prompts. Our findings indicate that the o1 model demonstrates relatively improved safety performance; however, it still exhibits vulnerabilities, particularly against jailbreak attacks employing mathematical encoding. Through detailed case analysis, we identified specific patterns in the o1 model's responses. We also explored the alignment of System-2 safety in open-source models using prompt engineering and supervised fine-tuning techniques. Experimental results show that some simple methods to encourage the model to carefully scrutinize user requests are beneficial for model safety. Additionally, we proposed a implementation plan for process supervision to enhance safety alignment. The implementation details and experimental results will be provided in future versions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17075v5-abstract-full').style.display = 'none'; document.getElementById('2411.17075v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In this version, the DPO and reinforcement learning methods have been added</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15839">arXiv:2411.15839</a> <span> [<a href="https://arxiv.org/pdf/2411.15839">pdf</a>, <a href="https://arxiv.org/format/2411.15839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VaLiD: Mitigating the Hallucination of Large Vision Language Models by Visual Layer Fusion Contrastive Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifei Gao</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15839v1-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) have demonstrated outstanding performance in multimodal task reasoning. However, they often generate responses that appear plausible yet do not accurately reflect the visual content, a phenomenon known as hallucination. Recent approaches have introduced training-free methods that mitigate hallucinations by adjusting the decoding strategy during inference stage,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15839v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15839v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15839v1-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) have demonstrated outstanding performance in multimodal task reasoning. However, they often generate responses that appear plausible yet do not accurately reflect the visual content, a phenomenon known as hallucination. Recent approaches have introduced training-free methods that mitigate hallucinations by adjusting the decoding strategy during inference stage, typically attributing hallucination to the language model itself. Our analysis, however, reveals that distortions in the visual encoding process significantly affect the model's reasoning accuracy. Specifically, earlier visual layers may retain key features but gradually distort as the information propagates toward the output layer. Building on these findings, we propose a novel hallucination-mitigation method from the visual encoding perspective: \textbf{V}isu\textbf{a}l \textbf{L}ayer Fus\textbf{i}on Contrastive \textbf{D}ecoding (VaLiD). This method utilizes uncertainty to guide the selection of visual hidden layers, correcting distortions in the visual encoding process and thereby improving the reliability of generated text. Experimental results show that VaLiD effectively reduces hallucinations across various benchmarks, achieving state-of-the-art performance compared to multiple baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15839v1-abstract-full').style.display = 'none'; document.getElementById('2411.15839v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09365">arXiv:2410.09365</a> <span> [<a href="https://arxiv.org/pdf/2410.09365">pdf</a>, <a href="https://arxiv.org/format/2410.09365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Debiasing Vison-Language Models with Text-Only Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yunfan Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chaoquan Jiang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhiyu Lin</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinlin Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09365v1-abstract-short" style="display: inline;"> Pre-trained vision-language models (VLMs), such as CLIP, have exhibited remarkable performance across various downstream tasks by aligning text and images in a unified embedding space. However, due to the imbalanced distribution of pre-trained datasets, CLIP suffers from the bias problem in real-world applications. Existing debiasing methods struggle to obtain sufficient image samples for minority… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09365v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09365v1-abstract-full" style="display: none;"> Pre-trained vision-language models (VLMs), such as CLIP, have exhibited remarkable performance across various downstream tasks by aligning text and images in a unified embedding space. However, due to the imbalanced distribution of pre-trained datasets, CLIP suffers from the bias problem in real-world applications. Existing debiasing methods struggle to obtain sufficient image samples for minority groups and incur high costs for group labeling. To address the limitations, we propose a Text-Only Debiasing framework called TOD, leveraging a text-as-image training paradigm to mitigate visual biases. Specifically, this approach repurposes the text encoder to function as an image encoder, thereby eliminating the need for image data. Simultaneously, it utilizes a large language model (LLM) to generate a balanced text dataset, which is then used for prompt tuning. However, we observed that the model overfits to the text modality because label names, serving as supervision signals, appear explicitly in the texts. To address this issue, we further introduce a Multi-Target Prediction (MTP) task that motivates the model to focus on complex contexts and distinguish between target and biased information. Extensive experiments on the Waterbirds and CelebA datasets show that our method significantly improves group robustness, achieving state-of-the-art results among image-free methods and even competitive performance compared to image-supervised methods. Furthermore, the proposed method can be adapted to challenging scenarios with multiple or unknown bias attributes, demonstrating its strong generalization and robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09365v1-abstract-full').style.display = 'none'; document.getElementById('2410.09365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05346">arXiv:2410.05346</a> <span> [<a href="https://arxiv.org/pdf/2410.05346">pdf</a>, <a href="https://arxiv.org/format/2410.05346">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AnyAttack: Targeted Adversarial Attacks on Vision-Language Models toward Any Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Junhong Ye</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yige Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yunfan Yang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Yeung%2C+D">Dit-Yan Yeung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05346v2-abstract-short" style="display: inline;"> Due to their multimodal capabilities, Vision-Language Models (VLMs) have found numerous impactful applications in real-world scenarios. However, recent studies have revealed that VLMs are vulnerable to image-based adversarial attacks, particularly targeted adversarial images that manipulate the model to generate harmful content specified by the adversary. Current attack methods rely on predefined… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05346v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05346v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05346v2-abstract-full" style="display: none;"> Due to their multimodal capabilities, Vision-Language Models (VLMs) have found numerous impactful applications in real-world scenarios. However, recent studies have revealed that VLMs are vulnerable to image-based adversarial attacks, particularly targeted adversarial images that manipulate the model to generate harmful content specified by the adversary. Current attack methods rely on predefined target labels to create targeted adversarial attacks, which limits their scalability and applicability for large-scale robustness evaluations. In this paper, we propose AnyAttack, a self-supervised framework that generates targeted adversarial images for VLMs without label supervision, allowing any image to serve as a target for the attack. Our framework employs the pre-training and fine-tuning paradigm, with the adversarial noise generator pre-trained on the large-scale LAION-400M dataset. This large-scale pre-training endows our method with powerful transferability across a wide range of VLMs. Extensive experiments on five mainstream open-source VLMs (CLIP, BLIP, BLIP2, InstructBLIP, and MiniGPT-4) across three multimodal tasks (image-text retrieval, multimodal classification, and image captioning) demonstrate the effectiveness of our attack. Additionally, we successfully transfer AnyAttack to multiple commercial VLMs, including Google Gemini, Claude Sonnet, Microsoft Copilot and OpenAI GPT. These results reveal an unprecedented risk to VLMs, highlighting the need for effective countermeasures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05346v2-abstract-full').style.display = 'none'; document.getElementById('2410.05346v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09318">arXiv:2409.09318</a> <span> [<a href="https://arxiv.org/pdf/2409.09318">pdf</a>, <a href="https://arxiv.org/format/2409.09318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ODE: Open-Set Evaluation of Hallucinations in Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tu%2C+Y">Yahan Tu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09318v3-abstract-short" style="display: inline;"> Hallucination poses a persistent challenge for multimodal large language models (MLLMs). However, existing benchmarks for evaluating hallucinations are generally static, which may overlook the potential risk of data contamination. To address this issue, we propose ODE, an open-set, dynamic protocol designed to evaluate object hallucinations in MLLMs at both the existence and attribute levels. ODE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09318v3-abstract-full').style.display = 'inline'; document.getElementById('2409.09318v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09318v3-abstract-full" style="display: none;"> Hallucination poses a persistent challenge for multimodal large language models (MLLMs). However, existing benchmarks for evaluating hallucinations are generally static, which may overlook the potential risk of data contamination. To address this issue, we propose ODE, an open-set, dynamic protocol designed to evaluate object hallucinations in MLLMs at both the existence and attribute levels. ODE employs a graph-based structure to represent real-world object concepts, their attributes, and the distributional associations between them. This structure facilitates the extraction of concept combinations based on diverse distributional criteria, generating varied samples for structured queries that evaluate hallucinations in both generative and discriminative tasks. Through the generation of new samples, dynamic concept combinations, and varied distribution frequencies, ODE mitigates the risk of data contamination and broadens the scope of evaluation. This protocol is applicable to both general and specialized scenarios, including those with limited data. Experimental results demonstrate the effectiveness of our protocol, revealing that MLLMs exhibit higher hallucination rates when evaluated with ODE-generated samples, which indicates potential data contamination. Furthermore, these generated samples aid in analyzing hallucination patterns and fine-tuning models, offering an effective approach to mitigating hallucinations in MLLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09318v3-abstract-full').style.display = 'none'; document.getElementById('2409.09318v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10334">arXiv:2408.10334</a> <span> [<a href="https://arxiv.org/pdf/2408.10334">pdf</a>, <a href="https://arxiv.org/format/2408.10334">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Disguised Wolf Is More Harmful Than a Toothless Tiger: Adaptive Malicious Code Injection Backdoor Attack Leveraging User Behavior as Triggers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shangxi Wu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10334v1-abstract-short" style="display: inline;"> In recent years, large language models (LLMs) have made significant progress in the field of code generation. However, as more and more users rely on these models for software development, the security risks associated with code generation models have become increasingly significant. Studies have shown that traditional deep learning robustness issues also negatively impact the field of code genera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10334v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10334v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10334v1-abstract-full" style="display: none;"> In recent years, large language models (LLMs) have made significant progress in the field of code generation. However, as more and more users rely on these models for software development, the security risks associated with code generation models have become increasingly significant. Studies have shown that traditional deep learning robustness issues also negatively impact the field of code generation. In this paper, we first present the game-theoretic model that focuses on security issues in code generation scenarios. This framework outlines possible scenarios and patterns where attackers could spread malicious code models to create security threats. We also pointed out for the first time that the attackers can use backdoor attacks to dynamically adjust the timing of malicious code injection, which will release varying degrees of malicious code depending on the skill level of the user. Through extensive experiments on leading code generation models, we validate our proposed game-theoretic model and highlight the significant threats that these new attack scenarios pose to the safe use of code models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10334v1-abstract-full').style.display = 'none'; document.getElementById('2408.10334v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05868">arXiv:2407.05868</a> <span> [<a href="https://arxiv.org/pdf/2407.05868">pdf</a>, <a href="https://arxiv.org/format/2407.05868">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> KG-FPQ: Evaluating Factuality Hallucination in LLMs with Knowledge Graph-based False Premise Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanxu Zhu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinlin Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05868v2-abstract-short" style="display: inline;"> Recent studies have demonstrated that large language models (LLMs) are susceptible to being misled by false premise questions (FPQs), leading to errors in factual knowledge, know as factuality hallucination. Existing benchmarks that assess this vulnerability primarily rely on manual construction, resulting in limited scale and lack of scalability. In this work, we introduce an automated, scalable… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05868v2-abstract-full').style.display = 'inline'; document.getElementById('2407.05868v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05868v2-abstract-full" style="display: none;"> Recent studies have demonstrated that large language models (LLMs) are susceptible to being misled by false premise questions (FPQs), leading to errors in factual knowledge, know as factuality hallucination. Existing benchmarks that assess this vulnerability primarily rely on manual construction, resulting in limited scale and lack of scalability. In this work, we introduce an automated, scalable pipeline to create FPQs based on knowledge graphs (KGs). The first step is modifying true triplets extracted from KGs to create false premises. Subsequently, utilizing the state-of-the-art capabilities of GPTs, we generate semantically rich FPQs. Based on the proposed method, we present a comprehensive benchmark, the Knowledge Graph-based False Premise Questions (KG-FPQ), which contains approximately 178k FPQs across three knowledge domains, at six levels of confusability, and in two task formats. Using KG-FPQ, we conduct extensive evaluations on several representative LLMs and provide valuable insights. The KG-FPQ dataset and code are available at~https://github.com/yanxuzhu/KG-FPQ. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05868v2-abstract-full').style.display = 'none'; document.getElementById('2407.05868v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLING2025 main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08773">arXiv:2406.08773</a> <span> [<a href="https://arxiv.org/pdf/2406.08773">pdf</a>, <a href="https://arxiv.org/format/2406.08773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DenoiseRep: Denoising Model for Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhengrui Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guan'an Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowen Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08773v4-abstract-short" style="display: inline;"> The denoising model has been proven a powerful generative model but has little exploration of discriminative tasks. Representation learning is important in discriminative tasks, which is defined as "learning representations (or features) of the data that make it easier to extract useful information when building classifiers or other predictors". In this paper, we propose a novel Denoising Model fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08773v4-abstract-full').style.display = 'inline'; document.getElementById('2406.08773v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08773v4-abstract-full" style="display: none;"> The denoising model has been proven a powerful generative model but has little exploration of discriminative tasks. Representation learning is important in discriminative tasks, which is defined as "learning representations (or features) of the data that make it easier to extract useful information when building classifiers or other predictors". In this paper, we propose a novel Denoising Model for Representation Learning (DenoiseRep) to improve feature discrimination with joint feature extraction and denoising. DenoiseRep views each embedding layer in a backbone as a denoising layer, processing the cascaded embedding layers as if we are recursively denoise features step-by-step. This unifies the frameworks of feature extraction and denoising, where the former progressively embeds features from low-level to high-level, and the latter recursively denoises features step-by-step. After that, DenoiseRep fuses the parameters of feature extraction and denoising layers, and theoretically demonstrates its equivalence before and after the fusion, thus making feature denoising computation-free. DenoiseRep is a label-free algorithm that incrementally improves features but also complementary to the label if available. Experimental results on various discriminative vision tasks, including re-identification (Market-1501, DukeMTMC-reID, MSMT17, CUHK-03, vehicleID), image classification (ImageNet, UB200, Oxford-Pet, Flowers), object detection (COCO), image segmentation (ADE20K) show stability and impressive improvements. We also validate its effectiveness on the CNN (ResNet) and Transformer (ViT, Swin, Vmamda) architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08773v4-abstract-full').style.display = 'none'; document.getElementById('2406.08773v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024 (Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01014">arXiv:2406.01014</a> <span> [<a href="https://arxiv.org/pdf/2406.01014">pdf</a>, <a href="https://arxiv.org/format/2406.01014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haiyang Xu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+H">Haitao Jia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+W">Weizhou Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01014v1-abstract-short" style="display: inline;"> Mobile device operation tasks are increasingly becoming a popular multi-modal AI application scenario. Current Multi-modal Large Language Models (MLLMs), constrained by their training data, lack the capability to function effectively as operation assistants. Instead, MLLM-based agents, which enhance capabilities through tool invocation, are gradually being applied to this scenario. However, the tw… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01014v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01014v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01014v1-abstract-full" style="display: none;"> Mobile device operation tasks are increasingly becoming a popular multi-modal AI application scenario. Current Multi-modal Large Language Models (MLLMs), constrained by their training data, lack the capability to function effectively as operation assistants. Instead, MLLM-based agents, which enhance capabilities through tool invocation, are gradually being applied to this scenario. However, the two major navigation challenges in mobile device operation tasks, task progress navigation and focus content navigation, are significantly complicated under the single-agent architecture of existing work. This is due to the overly long token sequences and the interleaved text-image data format, which limit performance. To address these navigation challenges effectively, we propose Mobile-Agent-v2, a multi-agent architecture for mobile device operation assistance. The architecture comprises three agents: planning agent, decision agent, and reflection agent. The planning agent generates task progress, making the navigation of history operations more efficient. To retain focus content, we design a memory unit that updates with task progress. Additionally, to correct erroneous operations, the reflection agent observes the outcomes of each operation and handles any mistakes accordingly. Experimental results indicate that Mobile-Agent-v2 achieves over a 30% improvement in task completion compared to the single-agent architecture of Mobile-Agent. The code is open-sourced at https://github.com/X-PLUG/MobileAgent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01014v1-abstract-full').style.display = 'none'; document.getElementById('2406.01014v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 11 figures, 10 Tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.08035">arXiv:2405.08035</a> <span> [<a href="https://arxiv.org/pdf/2405.08035">pdf</a>, <a href="https://arxiv.org/format/2405.08035">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A LLM-based Controllable, Scalable, Human-Involved User Simulator Framework for Conversational Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lixi Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowen Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.08035v1-abstract-short" style="display: inline;"> Conversational Recommender System (CRS) leverages real-time feedback from users to dynamically model their preferences, thereby enhancing the system's ability to provide personalized recommendations and improving the overall user experience. CRS has demonstrated significant promise, prompting researchers to concentrate their efforts on developing user simulators that are both more realistic and tr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.08035v1-abstract-full').style.display = 'inline'; document.getElementById('2405.08035v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.08035v1-abstract-full" style="display: none;"> Conversational Recommender System (CRS) leverages real-time feedback from users to dynamically model their preferences, thereby enhancing the system's ability to provide personalized recommendations and improving the overall user experience. CRS has demonstrated significant promise, prompting researchers to concentrate their efforts on developing user simulators that are both more realistic and trustworthy. The emergence of Large Language Models (LLMs) has marked the onset of a new epoch in computational capabilities, exhibiting human-level intelligence in various tasks. Research efforts have been made to utilize LLMs for building user simulators to evaluate the performance of CRS. Although these efforts showcase innovation, they are accompanied by certain limitations. In this work, we introduce a Controllable, Scalable, and Human-Involved (CSHI) simulator framework that manages the behavior of user simulators across various stages via a plugin manager. CSHI customizes the simulation of user behavior and interactions to provide a more lifelike and convincing user interaction experience. Through experiments and case studies in two conversational recommendation scenarios, we show that our framework can adapt to a variety of conversational recommendation settings and effectively simulate users' personalized preferences. Consequently, our simulator is able to generate feedback that closely mirrors that of real users. This facilitates a reliable assessment of existing CRS studies and promotes the creation of high-quality conversational recommendation datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.08035v1-abstract-full').style.display = 'none'; document.getElementById('2405.08035v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.17844">arXiv:2404.17844</a> <span> [<a href="https://arxiv.org/pdf/2404.17844">pdf</a>, <a href="https://arxiv.org/format/2404.17844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Recommendation: A Review and an Adversarial Robustness Evaluation Library </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lei Cheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowen Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jian Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.17844v1-abstract-short" style="display: inline;"> Recently, recommender system has achieved significant success. However, due to the openness of recommender systems, they remain vulnerable to malicious attacks. Additionally, natural noise in training data and issues such as data sparsity can also degrade the performance of recommender systems. Therefore, enhancing the robustness of recommender systems has become an increasingly important research… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17844v1-abstract-full').style.display = 'inline'; document.getElementById('2404.17844v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.17844v1-abstract-full" style="display: none;"> Recently, recommender system has achieved significant success. However, due to the openness of recommender systems, they remain vulnerable to malicious attacks. Additionally, natural noise in training data and issues such as data sparsity can also degrade the performance of recommender systems. Therefore, enhancing the robustness of recommender systems has become an increasingly important research topic. In this survey, we provide a comprehensive overview of the robustness of recommender systems. Based on our investigation, we categorize the robustness of recommender systems into adversarial robustness and non-adversarial robustness. In the adversarial robustness, we introduce the fundamental principles and classical methods of recommender system adversarial attacks and defenses. In the non-adversarial robustness, we analyze non-adversarial robustness from the perspectives of data sparsity, natural noise, and data imbalance. Additionally, we summarize commonly used datasets and evaluation metrics for evaluating the robustness of recommender systems. Finally, we also discuss the current challenges in the field of recommender system robustness and potential future research directions. Additionally, to facilitate fair and efficient evaluation of attack and defense methods in adversarial robustness, we propose an adversarial robustness evaluation library--ShillingREC, and we conduct evaluations of basic attack models and recommendation models. ShillingREC project is released at https://github.com/chengleileilei/ShillingREC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17844v1-abstract-full').style.display = 'none'; document.getElementById('2404.17844v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16305">arXiv:2404.16305</a> <span> [<a href="https://arxiv.org/pdf/2404.16305">pdf</a>, <a href="https://arxiv.org/format/2404.16305">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Semantically consistent Video-to-Audio Generation using Multimodal Language Large Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+G">Gehui Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guan'an Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowen Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16305v2-abstract-short" style="display: inline;"> Existing works have made strides in video generation, but the lack of sound effects (SFX) and background music (BGM) hinders a complete and immersive viewer experience. We introduce a novel semantically consistent v ideo-to-audio generation framework, namely SVA, which automatically generates audio semantically consistent with the given video content. The framework harnesses the power of multimoda… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16305v2-abstract-full').style.display = 'inline'; document.getElementById('2404.16305v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16305v2-abstract-full" style="display: none;"> Existing works have made strides in video generation, but the lack of sound effects (SFX) and background music (BGM) hinders a complete and immersive viewer experience. We introduce a novel semantically consistent v ideo-to-audio generation framework, namely SVA, which automatically generates audio semantically consistent with the given video content. The framework harnesses the power of multimodal large language model (MLLM) to understand video semantics from a key frame and generate creative audio schemes, which are then utilized as prompts for text-to-audio models, resulting in video-to-audio generation with natural language as an interface. We show the satisfactory performance of SVA through case study and discuss the limitations along with the future research direction. The project page is available at https://huiz-a.github.io/audio4video.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16305v2-abstract-full').style.display = 'none'; document.getElementById('2404.16305v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10332">arXiv:2404.10332</a> <span> [<a href="https://arxiv.org/pdf/2404.10332">pdf</a>, <a href="https://arxiv.org/format/2404.10332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Prescribing the Right Remedy: Mitigating Hallucinations in Large Vision-Language Models via Targeted Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Y">Yahan Tu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10332v1-abstract-short" style="display: inline;"> Despite achieving outstanding performance on various cross-modal tasks, current large vision-language models (LVLMs) still suffer from hallucination issues, manifesting as inconsistencies between their generated responses and the corresponding images. Prior research has implicated that the low quality of instruction data, particularly the skewed balance between positive and negative samples, is a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10332v1-abstract-full').style.display = 'inline'; document.getElementById('2404.10332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10332v1-abstract-full" style="display: none;"> Despite achieving outstanding performance on various cross-modal tasks, current large vision-language models (LVLMs) still suffer from hallucination issues, manifesting as inconsistencies between their generated responses and the corresponding images. Prior research has implicated that the low quality of instruction data, particularly the skewed balance between positive and negative samples, is a significant contributor to model hallucinations. Recently, researchers have proposed high-quality instruction datasets, such as LRV-Instruction, to mitigate model hallucination. Nonetheless, our investigation reveals that hallucinatory concepts from different LVLMs exhibit specificity, i.e. the distribution of hallucinatory concepts varies significantly across models. Existing datasets did not consider the hallucination specificity of different models in the design processes, thereby diminishing their efficacy in mitigating model hallucination. In this paper, we propose a targeted instruction data generation framework named DFTG that tailored to the hallucination specificity of different models. Concretely, DFTG consists of two stages: hallucination diagnosis, which extracts the necessary information from the model's responses and images for hallucination diagnosis; and targeted data generation, which generates targeted instruction data based on diagnostic results. The experimental results on hallucination benchmarks demonstrate that the targeted instruction data generated by our method are more effective in mitigating hallucinations compared to previous datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10332v1-abstract-full').style.display = 'none'; document.getElementById('2404.10332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04814">arXiv:2404.04814</a> <span> [<a href="https://arxiv.org/pdf/2404.04814">pdf</a>, <a href="https://arxiv.org/format/2404.04814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Inference-Time Rule Eraser: Fair Recognition via Distilling and Removing Biased Rules </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+D">Dongyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04814v4-abstract-short" style="display: inline;"> Machine learning models often make predictions based on biased features such as gender, race, and other social attributes, posing significant fairness risks, especially in societal applications, such as hiring, banking, and criminal justice. Traditional approaches to addressing this issue involve retraining or fine-tuning neural networks with fairness-aware optimization objectives. However, these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04814v4-abstract-full').style.display = 'inline'; document.getElementById('2404.04814v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04814v4-abstract-full" style="display: none;"> Machine learning models often make predictions based on biased features such as gender, race, and other social attributes, posing significant fairness risks, especially in societal applications, such as hiring, banking, and criminal justice. Traditional approaches to addressing this issue involve retraining or fine-tuning neural networks with fairness-aware optimization objectives. However, these methods can be impractical due to significant computational resources, complex industrial tests, and the associated CO2 footprint. Additionally, regular users often fail to fine-tune models because they lack access to model parameters In this paper, we introduce the Inference-Time Rule Eraser (Eraser), a novel method designed to address fairness concerns by removing biased decision-making rules from deployed models during inference without altering model weights. We begin by establishing a theoretical foundation for modifying model outputs to eliminate biased rules through Bayesian analysis. Next, we present a specific implementation of Eraser that involves two stages: (1) distilling the biased rules from the deployed model into an additional patch model, and (2) removing these biased rules from the output of the deployed model during inference. Extensive experiments validate the effectiveness of our approach, showcasing its superior performance in addressing fairness concerns in AI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04814v4-abstract-full').style.display = 'none'; document.getElementById('2404.04814v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.18205">arXiv:2403.18205</a> <span> [<a href="https://arxiv.org/pdf/2403.18205">pdf</a>, <a href="https://arxiv.org/format/2403.18205">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Privacy Protection Capabilities of Chinese Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuqi Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowen Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.18205v1-abstract-short" style="display: inline;"> Large language models (LLMs), renowned for their impressive capabilities in various tasks, have significantly advanced artificial intelligence. Yet, these advancements have raised growing concerns about privacy and security implications. To address these issues and explain the risks inherent in these models, we have devised a three-tiered progressive framework tailored for evaluating privacy in la… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18205v1-abstract-full').style.display = 'inline'; document.getElementById('2403.18205v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.18205v1-abstract-full" style="display: none;"> Large language models (LLMs), renowned for their impressive capabilities in various tasks, have significantly advanced artificial intelligence. Yet, these advancements have raised growing concerns about privacy and security implications. To address these issues and explain the risks inherent in these models, we have devised a three-tiered progressive framework tailored for evaluating privacy in language systems. This framework consists of progressively complex and in-depth privacy test tasks at each tier. Our primary objective is to comprehensively evaluate the sensitivity of large language models to private information, examining how effectively they discern, manage, and safeguard sensitive data in diverse scenarios. This systematic evaluation helps us understand the degree to which these models comply with privacy protection guidelines and the effectiveness of their inherent safeguards against privacy breaches. Our observations indicate that existing Chinese large language models universally show privacy protection shortcomings. It seems that at the moment this widespread issue is unavoidable and may pose corresponding privacy risks in applications based on these models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18205v1-abstract-full').style.display = 'none'; document.getElementById('2403.18205v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16416">arXiv:2403.16416</a> <span> [<a href="https://arxiv.org/pdf/2403.16416">pdf</a>, <a href="https://arxiv.org/format/2403.16416">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> How Reliable is Your Simulator? Analysis on the Limitations of Current LLM-based User Simulators for Conversational Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lixi Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowen Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16416v1-abstract-short" style="display: inline;"> Conversational Recommender System (CRS) interacts with users through natural language to understand their preferences and provide personalized recommendations in real-time. CRS has demonstrated significant potential, prompting researchers to address the development of more realistic and reliable user simulators as a key focus. Recently, the capabilities of Large Language Models (LLMs) have attract… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16416v1-abstract-full').style.display = 'inline'; document.getElementById('2403.16416v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16416v1-abstract-full" style="display: none;"> Conversational Recommender System (CRS) interacts with users through natural language to understand their preferences and provide personalized recommendations in real-time. CRS has demonstrated significant potential, prompting researchers to address the development of more realistic and reliable user simulators as a key focus. Recently, the capabilities of Large Language Models (LLMs) have attracted a lot of attention in various fields. Simultaneously, efforts are underway to construct user simulators based on LLMs. While these works showcase innovation, they also come with certain limitations that require attention. In this work, we aim to analyze the limitations of using LLMs in constructing user simulators for CRS, to guide future research. To achieve this goal, we conduct analytical validation on the notable work, iEvaLM. Through multiple experiments on two widely-used datasets in the field of conversational recommendation, we highlight several issues with the current evaluation methods for user simulators based on LLMs: (1) Data leakage, which occurs in conversational history and the user simulator's replies, results in inflated evaluation results. (2) The success of CRS recommendations depends more on the availability and quality of conversational history than on the responses from user simulators. (3) Controlling the output of the user simulator through a single prompt template proves challenging. To overcome these limitations, we propose SimpleUserSim, employing a straightforward strategy to guide the topic toward the target items. Our study validates the ability of CRS models to utilize the interaction information, significantly improving the recommendation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16416v1-abstract-full').style.display = 'none'; document.getElementById('2403.16416v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08542">arXiv:2403.08542</a> <span> [<a href="https://arxiv.org/pdf/2403.08542">pdf</a>, <a href="https://arxiv.org/format/2403.08542">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AIGCs Confuse AI Too: Investigating and Explaining Synthetic Image-induced Hallucinations in Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifei Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhiyu Lin</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08542v2-abstract-short" style="display: inline;"> The evolution of Artificial Intelligence Generated Contents (AIGCs) is advancing towards higher quality. The growing interactions with AIGCs present a new challenge to the data-driven AI community: While AI-generated contents have played a crucial role in a wide range of AI models, the potential hidden risks they introduce have not been thoroughly examined. Beyond human-oriented forgery detection,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08542v2-abstract-full').style.display = 'inline'; document.getElementById('2403.08542v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08542v2-abstract-full" style="display: none;"> The evolution of Artificial Intelligence Generated Contents (AIGCs) is advancing towards higher quality. The growing interactions with AIGCs present a new challenge to the data-driven AI community: While AI-generated contents have played a crucial role in a wide range of AI models, the potential hidden risks they introduce have not been thoroughly examined. Beyond human-oriented forgery detection, AI-generated content poses potential issues for AI models originally designed to process natural data. In this study, we underscore the exacerbated hallucination phenomena in Large Vision-Language Models (LVLMs) caused by AI-synthetic images. Remarkably, our findings shed light on a consistent AIGC \textbf{hallucination bias}: the object hallucinations induced by synthetic images are characterized by a greater quantity and a more uniform position distribution, even these synthetic images do not manifest unrealistic or additional relevant visual features compared to natural images. Moreover, our investigations on Q-former and Linear projector reveal that synthetic images may present token deviations after visual projection, thereby amplifying the hallucination bias. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08542v2-abstract-full').style.display = 'none'; document.getElementById('2403.08542v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00667">arXiv:2402.00667</a> <span> [<a href="https://arxiv.org/pdf/2402.00667">pdf</a>, <a href="https://arxiv.org/format/2402.00667">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Improving Weak-to-Strong Generalization with Scalable Oversight and Ensemble Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanxu Zhu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chao Kong</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Junhong Ye</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+S">Shuyu Wei</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinlin Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00667v1-abstract-short" style="display: inline;"> This paper presents a follow-up study to OpenAI's recent superalignment work on Weak-to-Strong Generalization (W2SG). Superalignment focuses on ensuring that high-level AI systems remain consistent with human values and intentions when dealing with complex, high-risk tasks. The W2SG framework has opened new possibilities for empirical research in this evolving field. Our study simulates two phases… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00667v1-abstract-full').style.display = 'inline'; document.getElementById('2402.00667v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00667v1-abstract-full" style="display: none;"> This paper presents a follow-up study to OpenAI's recent superalignment work on Weak-to-Strong Generalization (W2SG). Superalignment focuses on ensuring that high-level AI systems remain consistent with human values and intentions when dealing with complex, high-risk tasks. The W2SG framework has opened new possibilities for empirical research in this evolving field. Our study simulates two phases of superalignment under the W2SG framework: the development of general superhuman models and the progression towards superintelligence. In the first phase, based on human supervision, the quality of weak supervision is enhanced through a combination of scalable oversight and ensemble learning, reducing the capability gap between weak teachers and strong students. In the second phase, an automatic alignment evaluator is employed as the weak supervisor. By recursively updating this auto aligner, the capabilities of the weak teacher models are synchronously enhanced, achieving weak-to-strong supervision over stronger student models.We also provide an initial validation of the proposed approach for the first phase. Using the SciQ task as example, we explore ensemble learning for weak teacher models through bagging and boosting. Scalable oversight is explored through two auxiliary settings: human-AI interaction and AI-AI debate. Additionally, the paper discusses the impact of improved weak supervision on enhancing weak-to-strong generalization based on in-context learning. Experiment code and dataset will be released at https://github.com/ADaM-BJTU/W2SG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00667v1-abstract-full').style.display = 'none'; document.getElementById('2402.00667v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.16158">arXiv:2401.16158</a> <span> [<a href="https://arxiv.org/pdf/2401.16158">pdf</a>, <a href="https://arxiv.org/format/2401.16158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haiyang Xu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jiabo Ye</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+W">Weizhou Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.16158v2-abstract-short" style="display: inline;"> Mobile device agent based on Multimodal Large Language Models (MLLM) is becoming a popular application. In this paper, we introduce Mobile-Agent, an autonomous multi-modal mobile device agent. Mobile-Agent first leverages visual perception tools to accurately identify and locate both the visual and textual elements within the app's front-end interface. Based on the perceived vision context, it the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16158v2-abstract-full').style.display = 'inline'; document.getElementById('2401.16158v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.16158v2-abstract-full" style="display: none;"> Mobile device agent based on Multimodal Large Language Models (MLLM) is becoming a popular application. In this paper, we introduce Mobile-Agent, an autonomous multi-modal mobile device agent. Mobile-Agent first leverages visual perception tools to accurately identify and locate both the visual and textual elements within the app's front-end interface. Based on the perceived vision context, it then autonomously plans and decomposes the complex operation task, and navigates the mobile Apps through operations step by step. Different from previous solutions that rely on XML files of Apps or mobile system metadata, Mobile-Agent allows for greater adaptability across diverse mobile operating environments in a vision-centric way, thereby eliminating the necessity for system-specific customizations. To assess the performance of Mobile-Agent, we introduced Mobile-Eval, a benchmark for evaluating mobile device operations. Based on Mobile-Eval, we conducted a comprehensive evaluation of Mobile-Agent. The experimental results indicate that Mobile-Agent achieved remarkable accuracy and completion rates. Even with challenging instructions, such as multi-app operations, Mobile-Agent can still complete the requirements. Code and model will be open-sourced at https://github.com/X-PLUG/MobileAgent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16158v2-abstract-full').style.display = 'none'; document.getElementById('2401.16158v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2024 Workshop in Large Language Model (LLM) Agents</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.05588">arXiv:2312.05588</a> <span> [<a href="https://arxiv.org/pdf/2312.05588">pdf</a>, <a href="https://arxiv.org/format/2312.05588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Language-assisted Vision Model Debugger: A Sample-Free Approach to Finding and Fixing Bugs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chaoquan Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.05588v2-abstract-short" style="display: inline;"> Vision models with high overall accuracy often exhibit systematic errors in specific scenarios, posing potential serious safety concerns. Diagnosing bugs of vision models is gaining increased attention, however traditional diagnostic approaches require annotation efforts (eg rich metadata accompanying each samples of CelebA). To address this issue,We propose a language-assisted diagnostic method t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05588v2-abstract-full').style.display = 'inline'; document.getElementById('2312.05588v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.05588v2-abstract-full" style="display: none;"> Vision models with high overall accuracy often exhibit systematic errors in specific scenarios, posing potential serious safety concerns. Diagnosing bugs of vision models is gaining increased attention, however traditional diagnostic approaches require annotation efforts (eg rich metadata accompanying each samples of CelebA). To address this issue,We propose a language-assisted diagnostic method that uses texts instead of images to diagnose bugs in vision models based on multi-modal models (eg CLIP). Our approach connects the embedding space of CLIP with the buggy vision model to be diagnosed; meanwhile, utilizing a shared classifier and the cross-modal transferability of embedding space from CLIP, the text-branch of CLIP become a proxy model to find bugs in the buggy model. The proxy model can classify texts paired with images. During the diagnosis, a Large Language Model (LLM) is employed to obtain task-relevant corpora, and this corpora is used to extract keywords. Descriptions constructed with templates containing these keywords serve as input text to probe errors in the proxy model. Finally, we validate the ability to diagnose existing visual models using language on the Waterbirds and CelebA datasets, we can identify bugs comprehensible to human experts, uncovering not only known bugs but also previously unknown ones. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05588v2-abstract-full').style.display = 'none'; document.getElementById('2312.05588v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages,8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.16421">arXiv:2311.16421</a> <span> [<a href="https://arxiv.org/pdf/2311.16421">pdf</a>, <a href="https://arxiv.org/format/2311.16421">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> CDEval: A Benchmark for Measuring the Cultural Dimensions of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanxu Zhu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chao Kong</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+S">Shuyu Wei</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xiaoyuan Yi</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xing Xie</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.16421v3-abstract-short" style="display: inline;"> As the scaling of Large Language Models (LLMs) has dramatically enhanced their capabilities, there has been a growing focus on the alignment problem to ensure their responsible and ethical use. While existing alignment efforts predominantly concentrate on universal values such as the HHH principle, the aspect of culture, which is inherently pluralistic and diverse, has not received adequate attent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.16421v3-abstract-full').style.display = 'inline'; document.getElementById('2311.16421v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.16421v3-abstract-full" style="display: none;"> As the scaling of Large Language Models (LLMs) has dramatically enhanced their capabilities, there has been a growing focus on the alignment problem to ensure their responsible and ethical use. While existing alignment efforts predominantly concentrate on universal values such as the HHH principle, the aspect of culture, which is inherently pluralistic and diverse, has not received adequate attention. This work introduces a new benchmark, CDEval, aimed at evaluating the cultural dimensions of LLMs. CDEval is constructed by incorporating both GPT-4's automated generation and human verification, covering six cultural dimensions across seven domains. Our comprehensive experiments provide intriguing insights into the culture of mainstream LLMs, highlighting both consistencies and variations across different dimensions and domains. The findings underscore the importance of integrating cultural considerations in LLM development, particularly for applications in diverse cultural settings. Through CDEval, we aim to broaden the horizon of LLM alignment research by including cultural dimensions, thus providing a more holistic framework for the future development and evaluation of LLMs. This benchmark serves as a valuable resource for cultural studies in LLMs, paving the way for more culturally aware and sensitive models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.16421v3-abstract-full').style.display = 'none'; document.getElementById('2311.16421v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the Cross-Cultural Considerations in NLP Workshop @ ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.11261">arXiv:2311.11261</a> <span> [<a href="https://arxiv.org/pdf/2311.11261">pdf</a>, <a href="https://arxiv.org/format/2311.11261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Prompt Tuning for Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+L">Lingyu Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.11261v3-abstract-short" style="display: inline;"> With the rapid advancement of multimodal learning, pre-trained Vision-Language Models (VLMs) such as CLIP have demonstrated remarkable capacities in bridging the gap between visual and language modalities. However, these models remain vulnerable to adversarial attacks, particularly in the image modality, presenting considerable security risks. This paper introduces Adversarial Prompt Tuning (AdvPT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.11261v3-abstract-full').style.display = 'inline'; document.getElementById('2311.11261v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.11261v3-abstract-full" style="display: none;"> With the rapid advancement of multimodal learning, pre-trained Vision-Language Models (VLMs) such as CLIP have demonstrated remarkable capacities in bridging the gap between visual and language modalities. However, these models remain vulnerable to adversarial attacks, particularly in the image modality, presenting considerable security risks. This paper introduces Adversarial Prompt Tuning (AdvPT), a novel technique to enhance the adversarial robustness of image encoders in VLMs. AdvPT innovatively leverages learnable text prompts and aligns them with adversarial image embeddings, to address the vulnerabilities inherent in VLMs without the need for extensive parameter training or modification of the model architecture. We demonstrate that AdvPT improves resistance against white-box and black-box adversarial attacks and exhibits a synergistic effect when combined with existing image-processing-based defense techniques, further boosting defensive capabilities. Comprehensive experimental analyses provide insights into adversarial prompt tuning, a novel paradigm devoted to improving resistance to adversarial images through textual input modifications, paving the way for future robust multimodal learning research. These findings open up new possibilities for enhancing the security of VLMs. Our code is available at https://github.com/jiamingzhang94/Adversarial-Prompt-Tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.11261v3-abstract-full').style.display = 'none'; document.getElementById('2311.11261v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.07397">arXiv:2311.07397</a> <span> [<a href="https://arxiv.org/pdf/2311.07397">pdf</a>, <a href="https://arxiv.org/format/2311.07397">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AMBER: An LLM-free Multi-dimensional Benchmark for MLLMs Hallucination Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guohai Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yukai Gu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+H">Haitao Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haiyang Xu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.07397v2-abstract-short" style="display: inline;"> Despite making significant progress in multi-modal tasks, current Multi-modal Large Language Models (MLLMs) encounter the significant challenge of hallucinations, which may lead to harmful consequences. Therefore, evaluating MLLMs' hallucinations is becoming increasingly important in model improvement and practical application deployment. Previous works are limited in high evaluation costs (e.g.,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07397v2-abstract-full').style.display = 'inline'; document.getElementById('2311.07397v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.07397v2-abstract-full" style="display: none;"> Despite making significant progress in multi-modal tasks, current Multi-modal Large Language Models (MLLMs) encounter the significant challenge of hallucinations, which may lead to harmful consequences. Therefore, evaluating MLLMs' hallucinations is becoming increasingly important in model improvement and practical application deployment. Previous works are limited in high evaluation costs (e.g., relying on humans or advanced LLMs) and insufficient evaluation dimensions (e.g., types of tasks and hallucinations). In this paper, we propose an LLM-free multi-dimensional benchmark AMBER, which can be used to evaluate both generative task and discriminative task including existence, attribute and relation hallucination. Based on AMBER, we design a low-cost and efficient evaluation pipeline. Additionally, we conduct a comprehensive evaluation and detailed analysis of mainstream MLLMs including GPT-4V(ision), and also give guideline suggestions for mitigating hallucinations. The data and code of AMBER are available at https://github.com/junyangwang0410/AMBER. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07397v2-abstract-full').style.display = 'none'; document.getElementById('2311.07397v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13378">arXiv:2310.13378</a> <span> [<a href="https://arxiv.org/pdf/2310.13378">pdf</a>, <a href="https://arxiv.org/format/2310.13378">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ScalableMap: Scalable Map Learning for Online Long-Range Vectorized HD Map Construction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jingyi Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zizhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shengfu Xia</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jizhang Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13378v2-abstract-short" style="display: inline;"> We propose a novel end-to-end pipeline for online long-range vectorized high-definition (HD) map construction using on-board camera sensors. The vectorized representation of HD maps, employing polylines and polygons to represent map elements, is widely used by downstream tasks. However, previous schemes designed with reference to dynamic object detection overlook the structural constraints within… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13378v2-abstract-full').style.display = 'inline'; document.getElementById('2310.13378v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13378v2-abstract-full" style="display: none;"> We propose a novel end-to-end pipeline for online long-range vectorized high-definition (HD) map construction using on-board camera sensors. The vectorized representation of HD maps, employing polylines and polygons to represent map elements, is widely used by downstream tasks. However, previous schemes designed with reference to dynamic object detection overlook the structural constraints within linear map elements, resulting in performance degradation in long-range scenarios. In this paper, we exploit the properties of map elements to improve the performance of map construction. We extract more accurate bird's eye view (BEV) features guided by their linear structure, and then propose a hierarchical sparse map representation to further leverage the scalability of vectorized map elements and design a progressive decoding mechanism and a supervision strategy based on this representation. Our approach, ScalableMap, demonstrates superior performance on the nuScenes dataset, especially in long-range scenarios, surpassing previous state-of-the-art model by 6.5 mAP while achieving 18.3 FPS. Code is available at https://github.com/jingy1yu/ScalableMap. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13378v2-abstract-full').style.display = 'none'; document.getElementById('2310.13378v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Conference on Robot Learning. PMLR, 2023: 2429-2443 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13266">arXiv:2310.13266</a> <span> [<a href="https://arxiv.org/pdf/2310.13266">pdf</a>, <a href="https://arxiv.org/ps/2310.13266">ps</a>, <a href="https://arxiv.org/format/2310.13266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Measurement-Based Small-Scale Channel Model for Sub-6 GHz RIS-Assisted Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jian Sang</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jifeng Lan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Mingyong Zhou</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Boning Gao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wankai Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&query=Matthaiou%2C+M">Michail Matthaiou</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shi Jin</a>, <a href="/search/cs?searchtype=author&query=Di+Renzo%2C+M">Marco Di Renzo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13266v3-abstract-short" style="display: inline;"> Reconfigurable intelligent surfaces (RISs) have attracted increasing interest from both academia and industry, thanks to their unique features on controlling electromagnetic (EM) waves. Although theoretical models for RIS-empowered communications have covered a variety of applications, yet, very few papers have investigated the modeling of real propagation characteristics. In this paper, we fill t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13266v3-abstract-full').style.display = 'inline'; document.getElementById('2310.13266v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13266v3-abstract-full" style="display: none;"> Reconfigurable intelligent surfaces (RISs) have attracted increasing interest from both academia and industry, thanks to their unique features on controlling electromagnetic (EM) waves. Although theoretical models for RIS-empowered communications have covered a variety of applications, yet, very few papers have investigated the modeling of real propagation characteristics. In this paper, we fill this gap by providing an empirical statistical channel model to describe the small-scale channel variations for an RIS-assisted broadband system at 2.6 GHz. Based on real channel measurements in outdoor, indoor and outdoor-to-indoor (O2I) environments, we compare and analyze the global, inter-cluster and intra-cluster parameters. Measurement results indicate that the deployment of an RIS with proper phase configurations can significantly improve the channel quality by enhancing the $K$-factor and reducing the time dispersion. The small-scale fading is well characterized by the proposed statistical model and the empirical channel parameters. These results are essential for the design of emerging RIS-assisted wireless systems for future applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13266v3-abstract-full').style.display = 'none'; document.getElementById('2310.13266v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.15126">arXiv:2308.15126</a> <span> [<a href="https://arxiv.org/pdf/2308.15126">pdf</a>, <a href="https://arxiv.org/format/2308.15126">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evaluation and Analysis of Hallucination in Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yiyang Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guohai Xu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+P">Pengcheng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chenlin Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haiyang Xu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qinghao Ye</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jihua Zhu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haoyu Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.15126v3-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) have recently achieved remarkable success. However, LVLMs are still plagued by the hallucination problem, which limits the practicality in many scenarios. Hallucination refers to the information of LVLMs' responses that does not exist in the visual input, which poses potential risks of substantial consequences. There has been limited work studying hallucination… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.15126v3-abstract-full').style.display = 'inline'; document.getElementById('2308.15126v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.15126v3-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) have recently achieved remarkable success. However, LVLMs are still plagued by the hallucination problem, which limits the practicality in many scenarios. Hallucination refers to the information of LVLMs' responses that does not exist in the visual input, which poses potential risks of substantial consequences. There has been limited work studying hallucination evaluation in LVLMs. In this paper, we propose Hallucination Evaluation based on Large Language Models (HaELM), an LLM-based hallucination evaluation framework. HaELM achieves an approximate 95% performance comparable to ChatGPT and has additional advantages including low cost, reproducibility, privacy preservation and local deployment. Leveraging the HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we analyze the factors contributing to hallucination in LVLMs and offer helpful suggestions to mitigate the hallucination problem. Our training data and human annotation hallucination data will be made public soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.15126v3-abstract-full').style.display = 'none'; document.getElementById('2308.15126v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.14002">arXiv:2308.14002</a> <span> [<a href="https://arxiv.org/pdf/2308.14002">pdf</a>, <a href="https://arxiv.org/format/2308.14002">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Physics Education">physics.ed-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Hands-on Quantum Programming Labs for EECS Students </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sang%2C+J">Janche Sang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chansu Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.14002v5-abstract-short" style="display: inline;"> This report presents a practical approach to teaching quantum computing to Electrical Engineering & Computer Science (EECS) students through dedicated hands-on programming labs. The labs cover a diverse range of topics, encompassing fundamental elements, such as entanglement, quantum gates and circuits, as well as advanced algorithms including Quantum Key Distribution, Deutsch and Deutsch-Jozsa Al… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.14002v5-abstract-full').style.display = 'inline'; document.getElementById('2308.14002v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.14002v5-abstract-full" style="display: none;"> This report presents a practical approach to teaching quantum computing to Electrical Engineering & Computer Science (EECS) students through dedicated hands-on programming labs. The labs cover a diverse range of topics, encompassing fundamental elements, such as entanglement, quantum gates and circuits, as well as advanced algorithms including Quantum Key Distribution, Deutsch and Deutsch-Jozsa Algorithms, Simon's algorithm, and Grover's algorithm. As educators, we aim to share our teaching insights and resources with fellow instructors in the field. The full lab handouts and program templates are provided for interested instructors. Furthermore, the report elucidates the rationale behind the design of each experiment, enabling a deeper understanding of quantum computing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.14002v5-abstract-full').style.display = 'none'; document.getElementById('2308.14002v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Program templates have been updated based on Qiskit version 1.2. Latex handouts are located in the subdir 'handouts_latex,' and program templates can be found in the 'pub' folder</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.08482">arXiv:2308.08482</a> <span> [<a href="https://arxiv.org/pdf/2308.08482">pdf</a>, <a href="https://arxiv.org/format/2308.08482">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3581783.3612317">10.1145/3581783.3612317 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Benign Shortcut for Debiasing: Fair Visual Recognition via Intervention with Shortcut Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.08482v1-abstract-short" style="display: inline;"> Machine learning models often learn to make predictions that rely on sensitive social attributes like gender and race, which poses significant fairness risks, especially in societal applications, such as hiring, banking, and criminal justice. Existing work tackles this issue by minimizing the employed information about social attributes in models for debiasing. However, the high correlation betwee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08482v1-abstract-full').style.display = 'inline'; document.getElementById('2308.08482v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.08482v1-abstract-full" style="display: none;"> Machine learning models often learn to make predictions that rely on sensitive social attributes like gender and race, which poses significant fairness risks, especially in societal applications, such as hiring, banking, and criminal justice. Existing work tackles this issue by minimizing the employed information about social attributes in models for debiasing. However, the high correlation between target task and these social attributes makes learning on the target task incompatible with debiasing. Given that model bias arises due to the learning of bias features (\emph{i.e}., gender) that help target task optimization, we explore the following research question: \emph{Can we leverage shortcut features to replace the role of bias feature in target task optimization for debiasing?} To this end, we propose \emph{Shortcut Debiasing}, to first transfer the target task's learning of bias attributes from bias features to shortcut features, and then employ causal intervention to eliminate shortcut features during inference. The key idea of \emph{Shortcut Debiasing} is to design controllable shortcut features to on one hand replace bias features in contributing to the target task during the training stage, and on the other hand be easily removed by intervention during the inference stage. This guarantees the learning of the target task does not hinder the elimination of bias features. We apply \emph{Shortcut Debiasing} to several benchmark datasets, and achieve significant improvements over the state-of-the-art debiasing methods in both accuracy and fairness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08482v1-abstract-full').style.display = 'none'; document.getElementById('2308.08482v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2211.01253</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.02168">arXiv:2308.02168</a> <span> [<a href="https://arxiv.org/pdf/2308.02168">pdf</a>, <a href="https://arxiv.org/format/2308.02168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> You talk what you read: Understanding News Comment Behavior by Dispositional and Situational Attribution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+D">Dongyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.02168v1-abstract-short" style="display: inline;"> Many news comment mining studies are based on the assumption that comment is explicitly linked to the corresponding news. In this paper, we observed that users' comments are also heavily influenced by their individual characteristics embodied by the interaction history. Therefore, we position to understand news comment behavior by considering both the dispositional factors from news interaction hi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02168v1-abstract-full').style.display = 'inline'; document.getElementById('2308.02168v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.02168v1-abstract-full" style="display: none;"> Many news comment mining studies are based on the assumption that comment is explicitly linked to the corresponding news. In this paper, we observed that users' comments are also heavily influenced by their individual characteristics embodied by the interaction history. Therefore, we position to understand news comment behavior by considering both the dispositional factors from news interaction history, and the situational factors from corresponding news. A three-part encoder-decoder framework is proposed to model the generative process of news comment. The resultant dispositional and situational attribution contributes to understanding user focus and opinions, which are validated in applications of reader-aware news summarization and news aspect-opinion forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02168v1-abstract-full').style.display = 'none'; document.getElementById('2308.02168v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.09705">arXiv:2307.09705</a> <span> [<a href="https://arxiv.org/pdf/2307.09705">pdf</a>, <a href="https://arxiv.org/format/2307.09705">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CValues: Measuring the Values of Chinese Large Language Models from Safety to Responsibility </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guohai Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiayi Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haotian Xu</a>, <a href="/search/cs?searchtype=author&query=Si%2C+J">Jinghui Si</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhuoran Zhou</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+P">Peng Yi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xing Gao</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chao Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.09705v1-abstract-short" style="display: inline;"> With the rapid evolution of large language models (LLMs), there is a growing concern that they may pose risks or have negative social impacts. Therefore, evaluation of human values alignment is becoming increasingly important. Previous work mainly focuses on assessing the performance of LLMs on certain knowledge and reasoning abilities, while neglecting the alignment to human values, especially in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09705v1-abstract-full').style.display = 'inline'; document.getElementById('2307.09705v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.09705v1-abstract-full" style="display: none;"> With the rapid evolution of large language models (LLMs), there is a growing concern that they may pose risks or have negative social impacts. Therefore, evaluation of human values alignment is becoming increasingly important. Previous work mainly focuses on assessing the performance of LLMs on certain knowledge and reasoning abilities, while neglecting the alignment to human values, especially in a Chinese context. In this paper, we present CValues, the first Chinese human values evaluation benchmark to measure the alignment ability of LLMs in terms of both safety and responsibility criteria. As a result, we have manually collected adversarial safety prompts across 10 scenarios and induced responsibility prompts from 8 domains by professional experts. To provide a comprehensive values evaluation of Chinese LLMs, we not only conduct human evaluation for reliable comparison, but also construct multi-choice prompts for automatic evaluation. Our findings suggest that while most Chinese LLMs perform well in terms of safety, there is considerable room for improvement in terms of responsibility. Moreover, both the automatic and human evaluation are important for assessing the human values alignment in different aspects. The benchmark and code is available on ModelScope and Github. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09705v1-abstract-full').style.display = 'none'; document.getElementById('2307.09705v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in Process</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.06608">arXiv:2307.06608</a> <span> [<a href="https://arxiv.org/pdf/2307.06608">pdf</a>, <a href="https://arxiv.org/format/2307.06608">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Introducing Foundation Models as Surrogate Models: Advancing Towards More Practical Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+Q">Qi Yi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Changsheng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.06608v2-abstract-short" style="display: inline;"> Recently, the no-box adversarial attack, in which the attacker lacks access to the model's architecture, weights, and training data, become the most practical and challenging attack setup. However, there is an unawareness of the potential and flexibility inherent in the surrogate model selection process on no-box setting. Inspired by the burgeoning interest in utilizing foundational models to addr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.06608v2-abstract-full').style.display = 'inline'; document.getElementById('2307.06608v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.06608v2-abstract-full" style="display: none;"> Recently, the no-box adversarial attack, in which the attacker lacks access to the model's architecture, weights, and training data, become the most practical and challenging attack setup. However, there is an unawareness of the potential and flexibility inherent in the surrogate model selection process on no-box setting. Inspired by the burgeoning interest in utilizing foundational models to address downstream tasks, this paper adopts an innovative idea that 1) recasting adversarial attack as a downstream task. Specifically, image noise generation to meet the emerging trend and 2) introducing foundational models as surrogate models. Harnessing the concept of non-robust features, we elaborate on two guiding principles for surrogate model selection to explain why the foundational model is an optimal choice for this role. However, paradoxically, we observe that these foundational models underperform. Analyzing this unexpected behavior within the feature space, we attribute the lackluster performance of foundational models (e.g., CLIP) to their significant representational capacity and, conversely, their lack of discriminative prowess. To mitigate this issue, we propose the use of a margin-based loss strategy for the fine-tuning of foundational models on target images. The experimental results verify that our approach, which employs the basic Fast Gradient Sign Method (FGSM) attack algorithm, outstrips the performance of other, more convoluted algorithms. We conclude by advocating for the research community to consider surrogate models as crucial determinants in the effectiveness of adversarial attacks in no-box settings. The implications of our work bear relevance for improving the efficacy of such adversarial attacks and the overall robustness of AI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.06608v2-abstract-full').style.display = 'none'; document.getElementById('2307.06608v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03378">arXiv:2306.03378</a> <span> [<a href="https://arxiv.org/pdf/2306.03378">pdf</a>, <a href="https://arxiv.org/format/2306.03378">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Towards Alleviating the Object Bias in Prompt Tuning-based Factual Knowledge Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhang Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+D">Dongyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chao Kong</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.03378v2-abstract-short" style="display: inline;"> Many works employed prompt tuning methods to automatically optimize prompt queries and extract the factual knowledge stored in Pretrained Language Models. In this paper, we observe that the optimized prompts, including discrete prompts and continuous prompts, exhibit undesirable object bias. To handle this problem, we propose a novel prompt tuning method called MeCoD. consisting of three modules:… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03378v2-abstract-full').style.display = 'inline'; document.getElementById('2306.03378v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.03378v2-abstract-full" style="display: none;"> Many works employed prompt tuning methods to automatically optimize prompt queries and extract the factual knowledge stored in Pretrained Language Models. In this paper, we observe that the optimized prompts, including discrete prompts and continuous prompts, exhibit undesirable object bias. To handle this problem, we propose a novel prompt tuning method called MeCoD. consisting of three modules: Prompt Encoder, Object Equalization and Biased Object Obstruction. Experimental results show that MeCoD can significantly reduce the object bias and at the same time improve accuracy of factual knowledge extraction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03378v2-abstract-full').style.display = 'none'; document.getElementById('2306.03378v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2023 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.02021">arXiv:2306.02021</a> <span> [<a href="https://arxiv.org/pdf/2306.02021">pdf</a>, <a href="https://arxiv.org/format/2306.02021">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Black-box Adversarial Example Detection: A Data Reconstruction-based Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifei Gao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhiyu Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yunfan Yang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.02021v1-abstract-short" style="display: inline;"> Adversarial example detection is known to be an effective adversarial defense method. Black-box attack, which is a more realistic threat and has led to various black-box adversarial training-based defense methods, however, does not attract considerable attention in adversarial example detection. In this paper, we fill this gap by positioning the problem of black-box adversarial example detection (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02021v1-abstract-full').style.display = 'inline'; document.getElementById('2306.02021v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.02021v1-abstract-full" style="display: none;"> Adversarial example detection is known to be an effective adversarial defense method. Black-box attack, which is a more realistic threat and has led to various black-box adversarial training-based defense methods, however, does not attract considerable attention in adversarial example detection. In this paper, we fill this gap by positioning the problem of black-box adversarial example detection (BAD). Data analysis under the introduced BAD settings demonstrates (1) the incapability of existing detectors in addressing the black-box scenario and (2) the potential of exploring BAD solutions from a data perspective. To tackle the BAD problem, we propose a data reconstruction-based adversarial example detection method. Specifically, we use variational auto-encoder (VAE) to capture both pixel and frequency representations of normal examples. Then we use reconstruction error to detect adversarial examples. Compared with existing detection methods, the proposed method achieves substantially better detection performance in BAD, which helps promote the deployment of adversarial example detection-based defense solutions in real-world models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02021v1-abstract-full').style.display = 'none'; document.getElementById('2306.02021v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 8 figures, 13 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.07835">arXiv:2305.07835</a> <span> [<a href="https://arxiv.org/pdf/2305.07835">pdf</a>, <a href="https://arxiv.org/format/2305.07835">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Multi-Scenario Broadband Channel Measurement and Modeling for Sub-6 GHz RIS-Assisted Wireless Communication Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jian Sang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Mingyong Zhou</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jifeng Lan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Boning Gao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wankai Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shi Jin</a>, <a href="/search/cs?searchtype=author&query=Basar%2C+E">Ertugrul Basar</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cen Li</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Q">Qiang Cheng</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+T+J">Tie Jun Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.07835v1-abstract-short" style="display: inline;"> Reconfigurable intelligent surface (RIS)-empowered communication, has been considered widely as one of the revolutionary technologies for next generation networks. However, due to the novel propagation characteristics of RISs, underlying RIS channel modeling and measurement research is still in its infancy and not fully investigated. In this paper, we conduct multi-scenario broadband channel measu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07835v1-abstract-full').style.display = 'inline'; document.getElementById('2305.07835v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.07835v1-abstract-full" style="display: none;"> Reconfigurable intelligent surface (RIS)-empowered communication, has been considered widely as one of the revolutionary technologies for next generation networks. However, due to the novel propagation characteristics of RISs, underlying RIS channel modeling and measurement research is still in its infancy and not fully investigated. In this paper, we conduct multi-scenario broadband channel measurements and modeling for RIS-assisted communications at the sub-6 GHz band. The measurements are carried out in three scenarios covering outdoor, indoor, and outdoor-to-indoor (O2I) environments, which suffer from non-line-of-sight (NLOS) propagation inherently. Three propagation modes including intelligent reflection with RIS, specular reflection with RIS and the mode without RIS, are taken into account in each scenario. In addition, considering the cascaded characteristics of RIS-assisted channel by nature, two modified empirical models including floating-intercept (FI) and close-in (CI) are proposed, which cover distance and angle domains. The measurement results rooted in 2096 channel acquisitions verify the prediction accuracy of these proposed models. Moreover, the propagation characteristics for RIS-assisted channels, including path loss (PL) gain, PL exponent, spatial consistency, time dispersion, frequency stationarity, etc., are compared and analyzed comprehensively. These channel measurement and modeling results may lay the groundwork for future applications of RIS-assisted communication systems in practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07835v1-abstract-full').style.display = 'none'; document.getElementById('2305.07835v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.04043">arXiv:2305.04043</a> <span> [<a href="https://arxiv.org/pdf/2305.04043">pdf</a>, <a href="https://arxiv.org/format/2305.04043">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3581783.3612312">10.1145/3581783.3612312 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Echoes: Unsupervised Debiasing via Pseudo-bias Labeling in an Echo Chamber </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Y">Yahan Tu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.04043v2-abstract-short" style="display: inline;"> Neural networks often learn spurious correlations when exposed to biased training data, leading to poor performance on out-of-distribution data. A biased dataset can be divided, according to biased features, into bias-aligned samples (i.e., with biased features) and bias-conflicting samples (i.e., without biased features). Recent debiasing works typically assume that no bias label is available dur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04043v2-abstract-full').style.display = 'inline'; document.getElementById('2305.04043v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.04043v2-abstract-full" style="display: none;"> Neural networks often learn spurious correlations when exposed to biased training data, leading to poor performance on out-of-distribution data. A biased dataset can be divided, according to biased features, into bias-aligned samples (i.e., with biased features) and bias-conflicting samples (i.e., without biased features). Recent debiasing works typically assume that no bias label is available during the training phase, as obtaining such information is challenging and labor-intensive. Following this unsupervised assumption, existing methods usually train two models: a biased model specialized to learn biased features and a target model that uses information from the biased model for debiasing. This paper first presents experimental analyses revealing that the existing biased models overfit to bias-conflicting samples in the training data, which negatively impacts the debiasing performance of the target models. To address this issue, we propose a straightforward and effective method called Echoes, which trains a biased model and a target model with a different strategy. We construct an "echo chamber" environment by reducing the weights of samples which are misclassified by the biased model, to ensure the biased model fully learns the biased features without overfitting to the bias-conflicting samples. The biased model then assigns lower weights on the bias-conflicting samples. Subsequently, we use the inverse of the sample weights of the biased model for training the target model. Experiments show that our approach achieves superior debiasing results compared to the existing baselines on both synthetic and real-world datasets. Our code is available at https://github.com/isruihu/Echoes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04043v2-abstract-full').style.display = 'none'; document.getElementById('2305.04043v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM Multimedia 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.13273">arXiv:2304.13273</a> <span> [<a href="https://arxiv.org/pdf/2304.13273">pdf</a>, <a href="https://arxiv.org/format/2304.13273">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> From Association to Generation: Text-only Captioning by Unsupervised Cross-modal Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.13273v3-abstract-short" style="display: inline;"> With the development of Vision-Language Pre-training Models (VLPMs) represented by CLIP and ALIGN, significant breakthroughs have been achieved for association-based visual tasks such as image classification and image-text retrieval by the zero-shot capability of CLIP without fine-tuning. However, CLIP is hard to apply to generation-based tasks. This is due to the lack of decoder architecture and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13273v3-abstract-full').style.display = 'inline'; document.getElementById('2304.13273v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.13273v3-abstract-full" style="display: none;"> With the development of Vision-Language Pre-training Models (VLPMs) represented by CLIP and ALIGN, significant breakthroughs have been achieved for association-based visual tasks such as image classification and image-text retrieval by the zero-shot capability of CLIP without fine-tuning. However, CLIP is hard to apply to generation-based tasks. This is due to the lack of decoder architecture and pre-training tasks for generation. Although previous works have created generation capacity for CLIP through additional language models, a modality gap between the CLIP representations of different modalities and the inability of CLIP to model the offset of this gap, which fails the concept to transfer across modalities. To solve the problem, we try to map images/videos to the language modality and generate captions from the language modality. In this paper, we propose the K-nearest-neighbor Cross-modality Mapping (Knight), a zero-shot method from association to generation. With text-only unsupervised training, Knight achieves State-of-the-Art performance in zero-shot methods for image captioning and video captioning. Our code is available at https://github.com/junyangwang0410/Knight. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13273v3-abstract-full').style.display = 'none'; document.getElementById('2304.13273v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 15 figures, has been accepted by IJCAI 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.01489">arXiv:2304.01489</a> <span> [<a href="https://arxiv.org/pdf/2304.01489">pdf</a>, <a href="https://arxiv.org/format/2304.01489">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improved Visual Fine-tuning with Natural Language Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yuanhong Xu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Juhua Hu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Q">Qi Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.01489v2-abstract-short" style="display: inline;"> Fine-tuning a visual pre-trained model can leverage the semantic information from large-scale pre-training data and mitigate the over-fitting problem on downstream vision tasks with limited training examples. While the problem of catastrophic forgetting in pre-trained backbone has been extensively studied for fine-tuning, its potential bias from the corresponding pre-training task and data, attrac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01489v2-abstract-full').style.display = 'inline'; document.getElementById('2304.01489v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.01489v2-abstract-full" style="display: none;"> Fine-tuning a visual pre-trained model can leverage the semantic information from large-scale pre-training data and mitigate the over-fitting problem on downstream vision tasks with limited training examples. While the problem of catastrophic forgetting in pre-trained backbone has been extensively studied for fine-tuning, its potential bias from the corresponding pre-training task and data, attracts less attention. In this work, we investigate this problem by demonstrating that the obtained classifier after fine-tuning will be close to that induced by the pre-trained model. To reduce the bias in the classifier effectively, we introduce a reference distribution obtained from a fixed text classifier, which can help regularize the learned vision classifier. The proposed method, Text Supervised fine-tuning (TeS), is evaluated with diverse pre-trained vision models including ResNet and ViT, and text encoders including BERT and CLIP, on 11 downstream tasks. The consistent improvement with a clear margin over distinct scenarios confirms the effectiveness of our proposal. Code is available at \url{https://github.com/idstcv/TeS}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01489v2-abstract-full').style.display = 'none'; document.getElementById('2304.01489v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICCV'23</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.13046">arXiv:2303.13046</a> <span> [<a href="https://arxiv.org/pdf/2303.13046">pdf</a>, <a href="https://arxiv.org/format/2303.13046">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Quantized Phase Alignment by Discrete Phase Shifts for Reconfigurable Intelligent Surface-Assisted Communication Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jian Sang</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jifeng Lan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Mingyong Zhou</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Boning Gao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wankai Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xinping Yi</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.13046v1-abstract-short" style="display: inline;"> Reconfigurable intelligent surface (RIS) has aroused a surge of interest in recent years. In this paper, we investigate the joint phase alignment and phase quantization on discrete phase shift designs for RIS-assisted single-input single-output (SISO) system. Firstly, the phenomena of phase distribution in far field and near field are respectively unveiled, paving the way for discretization of pha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13046v1-abstract-full').style.display = 'inline'; document.getElementById('2303.13046v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.13046v1-abstract-full" style="display: none;"> Reconfigurable intelligent surface (RIS) has aroused a surge of interest in recent years. In this paper, we investigate the joint phase alignment and phase quantization on discrete phase shift designs for RIS-assisted single-input single-output (SISO) system. Firstly, the phenomena of phase distribution in far field and near field are respectively unveiled, paving the way for discretization of phase shift for RIS. Then, aiming at aligning phases, the phase distribution law and its underlying degree-of-freedom (DoF) are characterized, serving as the guideline of phase quantization strategies. Subsequently, two phase quantization methods, dynamic threshold phase quantization (DTPQ) and equal interval phase quantization (EIPQ), are proposed to strengthen the beamforming effect of RIS. DTPQ is capable of calculating the optimal discrete phase shifts with linear complexity in the number of unit cells on RIS, whilst EIPQ is a simplified method with a constant complexity yielding sub-optimal solution. Simulation results demonstrate that both methods achieve substantial improvements on power gain, stability, and robustness over traditional quantization methods. The path loss (PL) scaling law under discrete phase shift of RIS is unveiled for the first time, with the phase shifts designed by DTPQ due to its optimality. Additionally, the field trials conducted at 2.6 GHz and 35 GHz validate the favourable performance of the proposed methods in practical communication environment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13046v1-abstract-full').style.display = 'none'; document.getElementById('2303.13046v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.01504">arXiv:2303.01504</a> <span> [<a href="https://arxiv.org/pdf/2303.01504">pdf</a>, <a href="https://arxiv.org/format/2303.01504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Backdoor for Debias: Mitigating Model Bias with Backdoor Attack-based Artificial Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shangxi Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qiuyang He</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jian Yu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.01504v3-abstract-short" style="display: inline;"> With the swift advancement of deep learning, state-of-the-art algorithms have been utilized in various social situations. Nonetheless, some algorithms have been discovered to exhibit biases and provide unequal results. The current debiasing methods face challenges such as poor utilization of data or intricate training requirements. In this work, we found that the backdoor attack can construct an a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01504v3-abstract-full').style.display = 'inline'; document.getElementById('2303.01504v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.01504v3-abstract-full" style="display: none;"> With the swift advancement of deep learning, state-of-the-art algorithms have been utilized in various social situations. Nonetheless, some algorithms have been discovered to exhibit biases and provide unequal results. The current debiasing methods face challenges such as poor utilization of data or intricate training requirements. In this work, we found that the backdoor attack can construct an artificial bias similar to the model bias derived in standard training. Considering the strong adjustability of backdoor triggers, we are motivated to mitigate the model bias by carefully designing reverse artificial bias created from backdoor attack. Based on this, we propose a backdoor debiasing framework based on knowledge distillation, which effectively reduces the model bias from original data and minimizes security risks from the backdoor attack. The proposed solution is validated on both image and structured datasets, showing promising results. This work advances the understanding of backdoor attacks and highlights its potential for beneficial applications. The code for the study can be found at \url{https://anonymous.4open.science/r/DwB-BC07/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01504v3-abstract-full').style.display = 'none'; document.getElementById('2303.01504v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.01217">arXiv:2301.01217</a> <span> [<a href="https://arxiv.org/pdf/2301.01217">pdf</a>, <a href="https://arxiv.org/format/2301.01217">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unlearnable Clusters: Towards Label-agnostic Unlearnable Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+Q">Qi Yi</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Changsheng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.01217v4-abstract-short" style="display: inline;"> There is a growing interest in developing unlearnable examples (UEs) against visual privacy leaks on the Internet. UEs are training samples added with invisible but unlearnable noise, which have been found can prevent unauthorized training of machine learning models. UEs typically are generated via a bilevel optimization framework with a surrogate model to remove (minimize) errors from the origina… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.01217v4-abstract-full').style.display = 'inline'; document.getElementById('2301.01217v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.01217v4-abstract-full" style="display: none;"> There is a growing interest in developing unlearnable examples (UEs) against visual privacy leaks on the Internet. UEs are training samples added with invisible but unlearnable noise, which have been found can prevent unauthorized training of machine learning models. UEs typically are generated via a bilevel optimization framework with a surrogate model to remove (minimize) errors from the original samples, and then applied to protect the data against unknown target models. However, existing UE generation methods all rely on an ideal assumption called label-consistency, where the hackers and protectors are assumed to hold the same label for a given sample. In this work, we propose and promote a more practical label-agnostic setting, where the hackers may exploit the protected data quite differently from the protectors. E.g., a m-class unlearnable dataset held by the protector may be exploited by the hacker as a n-class dataset. Existing UE generation methods are rendered ineffective in this challenging setting. To tackle this challenge, we present a novel technique called Unlearnable Clusters (UCs) to generate label-agnostic unlearnable examples with cluster-wise perturbations. Furthermore, we propose to leverage VisionandLanguage Pre-trained Models (VLPMs) like CLIP as the surrogate model to improve the transferability of the crafted UCs to diverse domains. We empirically verify the effectiveness of our proposed approach under a variety of settings with different datasets, target models, and even commercial platforms Microsoft Azure and Baidu PaddlePaddle. Code is available at \url{https://github.com/jiamingzhang94/Unlearnable-Clusters}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.01217v4-abstract-full').style.display = 'none'; document.getElementById('2301.01217v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03412">arXiv:2212.03412</a> <span> [<a href="https://arxiv.org/pdf/2212.03412">pdf</a>, <a href="https://arxiv.org/format/2212.03412">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Artificial Intelligence Security Competition (AISC) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Peng Chen</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+S">Senyou Deng</a>, <a href="/search/cs?searchtype=author&query=L%2C+L">Lianji L</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yi Sun</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaxing Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yunteng Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yangyi Dong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+E">Enhui Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jincai Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shu Xu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xuelin Fu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Changfeng Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H">Haoliang Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuchong Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shen Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhimin Sun</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Junyi Cao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+T">Taiping Yao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+S">Shouhong Ding</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jian Lin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tianpeng Wu</a> , et al. (27 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03412v1-abstract-short" style="display: inline;"> The security of artificial intelligence (AI) is an important research area towards safe, reliable, and trustworthy AI systems. To accelerate the research on AI security, the Artificial Intelligence Security Competition (AISC) was organized by the Zhongguancun Laboratory, China Industrial Control Systems Cyber Emergency Response Team, Institute for Artificial Intelligence, Tsinghua University, and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03412v1-abstract-full').style.display = 'inline'; document.getElementById('2212.03412v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.03412v1-abstract-full" style="display: none;"> The security of artificial intelligence (AI) is an important research area towards safe, reliable, and trustworthy AI systems. To accelerate the research on AI security, the Artificial Intelligence Security Competition (AISC) was organized by the Zhongguancun Laboratory, China Industrial Control Systems Cyber Emergency Response Team, Institute for Artificial Intelligence, Tsinghua University, and RealAI as part of the Zhongguancun International Frontier Technology Innovation Competition (https://www.zgc-aisc.com/en). The competition consists of three tracks, including Deepfake Security Competition, Autonomous Driving Security Competition, and Face Recognition Security Competition. This report will introduce the competition rules of these three tracks and the solutions of top-ranking teams in each track. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03412v1-abstract-full').style.display = 'none'; document.getElementById('2212.03412v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report of AISC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.07275">arXiv:2211.07275</a> <span> [<a href="https://arxiv.org/pdf/2211.07275">pdf</a>, <a href="https://arxiv.org/format/2211.07275">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Zero-shot Image Captioning by Anchor-augmented Vision-Language Space Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.07275v1-abstract-short" style="display: inline;"> CLIP (Contrastive Language-Image Pre-Training) has shown remarkable zero-shot transfer capabilities in cross-modal correlation tasks such as visual classification and image retrieval. However, its performance in cross-modal generation tasks like zero-shot image captioning remains unsatisfied. In this work, we discuss that directly employing CLIP for zero-shot image captioning relies more on the te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.07275v1-abstract-full').style.display = 'inline'; document.getElementById('2211.07275v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.07275v1-abstract-full" style="display: none;"> CLIP (Contrastive Language-Image Pre-Training) has shown remarkable zero-shot transfer capabilities in cross-modal correlation tasks such as visual classification and image retrieval. However, its performance in cross-modal generation tasks like zero-shot image captioning remains unsatisfied. In this work, we discuss that directly employing CLIP for zero-shot image captioning relies more on the textual modality in context and largely ignores the visual information, which we call \emph{contextual language prior}. To address this, we propose Cross-modal Language Models (CLMs) to facilitate unsupervised cross-modal learning. We further propose Anchor Augment to guide the generative model's attention to the fine-grained information in the representation of CLIP. Experiments on MS COCO and Flickr 30K validate the promising performance of proposed approach in both captioning quality and computational efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.07275v1-abstract-full').style.display = 'none'; document.getElementById('2211.07275v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05946">arXiv:2211.05946</a> <span> [<a href="https://arxiv.org/pdf/2211.05946">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3390/s22062256">10.3390/s22062256 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Deep Reinforcement Learning Microgrid Optimization Strategy Considering Priority Flexible Demand Side </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jinsong Sang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hongbin Sun</a>, <a href="/search/cs?searchtype=author&query=Kou%2C+L">Lei Kou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05946v1-abstract-short" style="display: inline;"> As an efficient way to integrate multiple distributed energy resources and the user side, a microgrid is mainly faced with the problems of small-scale volatility, uncertainty, intermittency and demand-side uncertainty of DERs. The traditional microgrid has a single form and cannot meet the flexible energy dispatch between the complex demand side and the microgrid. In response to this problem, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05946v1-abstract-full').style.display = 'inline'; document.getElementById('2211.05946v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05946v1-abstract-full" style="display: none;"> As an efficient way to integrate multiple distributed energy resources and the user side, a microgrid is mainly faced with the problems of small-scale volatility, uncertainty, intermittency and demand-side uncertainty of DERs. The traditional microgrid has a single form and cannot meet the flexible energy dispatch between the complex demand side and the microgrid. In response to this problem, the overall environment of wind power, thermostatically controlled loads, energy storage systems, price-responsive loads and the main grid is proposed. Secondly, the centralized control of the microgrid operation is convenient for the control of the reactive power and voltage of the distributed power supply and the adjustment of the grid frequency. However, there is a problem in that the flexible loads aggregate and generate peaks during the electricity price valley. The existing research takes into account the power constraints of the microgrid and fails to ensure a sufficient supply of electric energy for a single flexible load. This paper considers the response priority of each unit component of TCLs and ESSs on the basis of the overall environment operation of the microgrid so as to ensure the power supply of the flexible load of the microgrid and save the power input cost to the greatest extent. Finally, the simulation optimization of the environment can be expressed as a Markov decision process process. It combines two stages of offline and online operations in the training process. The addition of multiple threads with the lack of historical data learning leads to low learning efficiency. The asynchronous advantage actor-critic with the experience replay pool memory library is added to solve the data correlation and nonstatic distribution problems during training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05946v1-abstract-full').style.display = 'none'; document.getElementById('2211.05946v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Sensors</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.01253">arXiv:2211.01253</a> <span> [<a href="https://arxiv.org/pdf/2211.01253">pdf</a>, <a href="https://arxiv.org/format/2211.01253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Fair Visual Recognition via Intervention with Proxy Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.01253v1-abstract-short" style="display: inline;"> Deep learning models often learn to make predictions that rely on sensitive social attributes like gender and race, which poses significant fairness risks, especially in societal applications, e.g., hiring, banking, and criminal justice. Existing work tackles this issue by minimizing information about social attributes in models for debiasing. However, the high correlation between target task and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01253v1-abstract-full').style.display = 'inline'; document.getElementById('2211.01253v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.01253v1-abstract-full" style="display: none;"> Deep learning models often learn to make predictions that rely on sensitive social attributes like gender and race, which poses significant fairness risks, especially in societal applications, e.g., hiring, banking, and criminal justice. Existing work tackles this issue by minimizing information about social attributes in models for debiasing. However, the high correlation between target task and social attributes makes bias mitigation incompatible with target task accuracy. Recalling that model bias arises because the learning of features in regard to bias attributes (i.e., bias features) helps target task optimization, we explore the following research question: \emph{Can we leverage proxy features to replace the role of bias feature in target task optimization for debiasing?} To this end, we propose \emph{Proxy Debiasing}, to first transfer the target task's learning of bias information from bias features to artificial proxy features, and then employ causal intervention to eliminate proxy features in inference. The key idea of \emph{Proxy Debiasing} is to design controllable proxy features to on one hand replace bias features in contributing to target task during the training stage, and on the other hand easily to be removed by intervention during the inference stage. This guarantees the elimination of bias features without affecting the target information, thus addressing the fairness-accuracy paradox in previous debiasing solutions. We apply \emph{Proxy Debiasing} to several benchmark datasets, and achieve significant improvements over the state-of-the-art debiasing methods in both of accuracy and fairness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01253v1-abstract-full').style.display = 'none'; document.getElementById('2211.01253v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.14562">arXiv:2210.14562</a> <span> [<a href="https://arxiv.org/pdf/2210.14562">pdf</a>, <a href="https://arxiv.org/format/2210.14562">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FairCLIP: Social Bias Elimination based on Attribute Prototype Learning and Representation Neutralization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.14562v2-abstract-short" style="display: inline;"> The Vision-Language Pre-training (VLP) models like CLIP have gained popularity in recent years. However, many works found that the social biases hidden in CLIP easily manifest in downstream tasks, especially in image retrieval, which can have harmful effects on human society. In this work, we propose FairCLIP to eliminate the social bias in CLIP-based image retrieval without damaging the retrieval… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.14562v2-abstract-full').style.display = 'inline'; document.getElementById('2210.14562v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.14562v2-abstract-full" style="display: none;"> The Vision-Language Pre-training (VLP) models like CLIP have gained popularity in recent years. However, many works found that the social biases hidden in CLIP easily manifest in downstream tasks, especially in image retrieval, which can have harmful effects on human society. In this work, we propose FairCLIP to eliminate the social bias in CLIP-based image retrieval without damaging the retrieval performance achieving the compatibility between the debiasing effect and the retrieval performance. FairCLIP is divided into two steps: Attribute Prototype Learning (APL) and Representation Neutralization (RN). In the first step, we extract the concepts needed for debiasing in CLIP. We use the query with learnable word vector prefixes as the extraction structure. In the second step, we first divide the attributes into target and bias attributes. By analysis, we find that both attributes have an impact on the bias. Therefore, we try to eliminate the bias by using Re-Representation Matrix (RRM) to achieve the neutralization of the representation. We compare the debiasing effect and retrieval performance with other methods, and experiments demonstrate that FairCLIP can achieve the best compatibility. Although FairCLIP is used to eliminate bias in image retrieval, it achieves the neutralization of the representation which is common to all CLIP downstream tasks. This means that FairCLIP can be applied as a general debiasing method for other fairness issues related to CLIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.14562v2-abstract-full').style.display = 'none'; document.getElementById('2210.14562v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.01056">arXiv:2207.01056</a> <span> [<a href="https://arxiv.org/pdf/2207.01056">pdf</a>, <a href="https://arxiv.org/format/2207.01056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Counterfactually Measuring and Eliminating Social Bias in Vision-Language Pre-training Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.01056v2-abstract-short" style="display: inline;"> Vision-Language Pre-training (VLP) models have achieved state-of-the-art performance in numerous cross-modal tasks. Since they are optimized to capture the statistical properties of intra- and inter-modality, there remains risk to learn social biases presented in the data as well. In this work, we (1) introduce a counterfactual-based bias measurement \emph{CounterBias} to quantify the social bias… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.01056v2-abstract-full').style.display = 'inline'; document.getElementById('2207.01056v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.01056v2-abstract-full" style="display: none;"> Vision-Language Pre-training (VLP) models have achieved state-of-the-art performance in numerous cross-modal tasks. Since they are optimized to capture the statistical properties of intra- and inter-modality, there remains risk to learn social biases presented in the data as well. In this work, we (1) introduce a counterfactual-based bias measurement \emph{CounterBias} to quantify the social bias in VLP models by comparing the [MASK]ed prediction probabilities of factual and counterfactual samples; (2) construct a novel VL-Bias dataset including 24K image-text pairs for measuring gender bias in VLP models, from which we observed that significant gender bias is prevalent in VLP models; and (3) propose a VLP debiasing method \emph{FairVLP} to minimize the difference in the [MASK]ed prediction probabilities between factual and counterfactual image-text pairs for VLP debiasing. Although CounterBias and FairVLP focus on social bias, they are generalizable to serve as tools and provide new insights to probe and regularize more knowledge in VLP models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.01056v2-abstract-full').style.display = 'none'; document.getElementById('2207.01056v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.09410">arXiv:2206.09410</a> <span> [<a href="https://arxiv.org/pdf/2206.09410">pdf</a>, <a href="https://arxiv.org/format/2206.09410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Low-Mid Adversarial Perturbation against Unauthorized Face Recognition System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+Q">Qi Yi</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+D">Dongyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.09410v2-abstract-short" style="display: inline;"> In light of the growing concerns regarding the unauthorized use of facial recognition systems and its implications on individual privacy, the exploration of adversarial perturbations as a potential countermeasure has gained traction. However, challenges arise in effectively deploying this approach against unauthorized facial recognition systems due to the effects of JPEG compression on image distr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.09410v2-abstract-full').style.display = 'inline'; document.getElementById('2206.09410v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.09410v2-abstract-full" style="display: none;"> In light of the growing concerns regarding the unauthorized use of facial recognition systems and its implications on individual privacy, the exploration of adversarial perturbations as a potential countermeasure has gained traction. However, challenges arise in effectively deploying this approach against unauthorized facial recognition systems due to the effects of JPEG compression on image distribution across the internet, which ultimately diminishes the efficacy of adversarial perturbations. Existing JPEG compression-resistant techniques struggle to strike a balance between resistance, transferability, and attack potency. To address these limitations, we propose a novel solution referred to as \emph{low frequency adversarial perturbation} (LFAP). This method conditions the source model to leverage low-frequency characteristics through adversarial training. To further enhance the performance, we introduce an improved \emph{low-mid frequency adversarial perturbation} (LMFAP) that incorporates mid-frequency components for an additive benefit. Our study encompasses a range of settings to replicate genuine application scenarios, including cross backbones, supervisory heads, training datasets, and testing datasets. Moreover, we evaluated our approaches on a commercial black-box API, \texttt{Face++}. The empirical results validate the cutting-edge performance achieved by our proposed solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.09410v2-abstract-full').style.display = 'none'; document.getElementById('2206.09410v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">published in Information Sciences</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.09391">arXiv:2206.09391</a> <span> [<a href="https://arxiv.org/pdf/2206.09391">pdf</a>, <a href="https://arxiv.org/format/2206.09391">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Towards Adversarial Attack on Vision-Language Pre-training Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+Q">Qi Yi</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.09391v2-abstract-short" style="display: inline;"> While vision-language pre-training model (VLP) has shown revolutionary improvements on various vision-language (V+L) tasks, the studies regarding its adversarial robustness remain largely unexplored. This paper studied the adversarial attack on popular VLP models and V+L tasks. First, we analyzed the performance of adversarial attacks under different settings. By examining the influence of differe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.09391v2-abstract-full').style.display = 'inline'; document.getElementById('2206.09391v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.09391v2-abstract-full" style="display: none;"> While vision-language pre-training model (VLP) has shown revolutionary improvements on various vision-language (V+L) tasks, the studies regarding its adversarial robustness remain largely unexplored. This paper studied the adversarial attack on popular VLP models and V+L tasks. First, we analyzed the performance of adversarial attacks under different settings. By examining the influence of different perturbed objects and attack targets, we concluded some key observations as guidance on both designing strong multimodal adversarial attack and constructing robust VLP models. Second, we proposed a novel multimodal attack method on the VLP models called Collaborative Multimodal Adversarial Attack (Co-Attack), which collectively carries out the attacks on the image modality and the text modality. Experimental results demonstrated that the proposed method achieves improved attack performances on different V+L downstream tasks and VLP models. The analysis observations and novel attack method hopefully provide new understanding into the adversarial robustness of VLP models, so as to contribute their safe and reliable deployment in more real-world scenarios. Code is available at https://github.com/adversarial-for-goodness/Co-Attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.09391v2-abstract-full').style.display = 'none'; document.getElementById('2206.09391v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM MM2022. Code is available in GitHub</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Sang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Sang%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Sang%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository