Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 1,405 results for author: <span class="mathjax">Mao, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Mao%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Mao, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Mao%2C+X&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Mao, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17473">arXiv:2411.17473</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17473">pdf</a>, <a href="https://arxiv.org/format/2411.17473">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TinyViM: Frequency Decoupling for Tiny Hybrid Vision Mamba </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaowen Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+Z">Zhenliang Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinghao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17473v1-abstract-short" style="display: inline;"> Mamba has shown great potential for computer vision due to its linear complexity in modeling the global context with respect to the input length. However, existing lightweight Mamba-based backbones cannot demonstrate performance that matches Convolution or Transformer-based methods. 
We observe that simply modifying the scanning path in the image domain is not conducive to fully exploiting the pote&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17473v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17473v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17473v1-abstract-full" style="display: none;"> Mamba has shown great potential for computer vision due to its linear complexity in modeling the global context with respect to the input length. However, existing lightweight Mamba-based backbones cannot demonstrate performance that matches Convolution or Transformer-based methods. We observe that simply modifying the scanning path in the image domain is not conducive to fully exploiting the potential of vision Mamba. In this paper, we first perform comprehensive spectral and quantitative analyses, and verify that the Mamba block mainly models low-frequency information under Convolution-Mamba hybrid architecture. Based on the analyses, we introduce a novel Laplace mixer to decouple the features in terms of frequency and input only the low-frequency components into the Mamba block. In addition, considering the redundancy of the features and the different requirements for high-frequency details and low-frequency global information at different stages, we introduce a frequency ramp inception, i.e., gradually reduce the input dimensions of the high-frequency branches, so as to efficiently trade-off the high-frequency and low-frequency components at different layers. By integrating mobile-friendly convolution and efficient Laplace mixer, we build a series of tiny hybrid vision Mamba called TinyViM. The proposed TinyViM achieves impressive performance on several downstream tasks including image classification, semantic segmentation, object detection and instance segmentation. In particular, TinyViM outperforms Convolution, Transformer and Mamba-based models with similar scales, and the throughput is about 2-3 times higher than that of other Mamba-based models. Code is available at https://github.com/xwmaxwma/TinyViM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17473v1-abstract-full').style.display = 'none'; document.getElementById('2411.17473v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
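   As a loose illustration of the frequency-decoupling idea in this abstract (not the authors' TinyViM code, which is released at the linked repository), the PyTorch sketch below splits a feature map into a low-frequency branch via pooling-and-upsampling and a high-frequency residual, routes the low-frequency part through a placeholder mixer standing in for the Mamba block, and fuses the branches. All module names here are hypothetical.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class LaplaceStyleSplit(nn.Module):
    """Illustrative frequency split: low = blurred (pool + upsample), high = residual."""
    def __init__(self, pool_size: int = 2):
        super().__init__()
        self.pool_size = pool_size

    def forward(self, x: torch.Tensor):
        # x: (B, C, H, W) feature map
        low = F.avg_pool2d(x, self.pool_size)                      # coarse, low-frequency content
        low = F.interpolate(low, size=x.shape[-2:], mode="nearest")
        high = x - low                                             # residual carries high-frequency detail
        return low, high

class ToyHybridBlock(nn.Module):
    """Route low frequencies to a global mixer (stand-in for the Mamba block),
    high frequencies to a cheap depthwise conv, then fuse the two branches."""
    def __init__(self, dim: int):
        super().__init__()
        self.split = LaplaceStyleSplit()
        self.low_mixer = nn.Conv2d(dim, dim, kernel_size=1)        # placeholder for a global token mixer
        self.high_mixer = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
        self.fuse = nn.Conv2d(2 * dim, dim, kernel_size=1)

    def forward(self, x):
        low, high = self.split(x)
        return self.fuse(torch.cat([self.low_mixer(low), self.high_mixer(high)], dim=1))

if __name__ == "__main__":
    block = ToyHybridBlock(dim=64)
    print(block(torch.randn(2, 64, 32, 32)).shape)  # torch.Size([2, 64, 32, 32])
```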
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17230">arXiv:2411.17230</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17230">pdf</a>, <a href="https://arxiv.org/format/2411.17230">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Fault Localization from the Semantic Code Search Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Y">Yihao Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shangwen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+Y">Yan Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhuo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+B">Bo Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xin Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liqian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xiaoguang Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17230v1-abstract-short" style="display: inline;"> The software development process is characterized by an iterative cycle of continuous functionality implementation and debugging, essential for the enhancement of software quality and adaptability to changing requirements. This process incorporates two isolatedly studied tasks: Code Search (CS), which retrieves reference code from a code corpus to aid in code implementation, and Fault Localization&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17230v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17230v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17230v1-abstract-full" style="display: none;"> The software development process is characterized by an iterative cycle of continuous functionality implementation and debugging, essential for the enhancement of software quality and adaptability to changing requirements. This process incorporates two isolatedly studied tasks: Code Search (CS), which retrieves reference code from a code corpus to aid in code implementation, and Fault Localization (FL), which identifies code entities responsible for bugs within the software project to boost software debugging. These two tasks exhibit similarities since they both address search problems. Notably, CS techniques have demonstrated greater effectiveness than FL ones, possibly because of the precise semantic details of the required code offered by natural language queries, which are not readily accessible to FL methods. Drawing inspiration from this, we hypothesize that a fault localizer could achieve greater proficiency if semantic information about the buggy methods were made available. Based on this idea, we propose CosFL, an FL approach that decomposes the FL task into two steps: query generation, which describes the functionality of the problematic code in natural language, and fault retrieval, which uses CS to find program elements semantically related to the query. 
Specifically, to depict the buggy functionalities and generate high-quality queries, CosFL extensively harnesses the code analysis, semantic comprehension, and decision-making capabilities of LLMs. Moreover, to enhance the accuracy of CS, CosFL captures varying levels of context information and employs a multi-granularity code search strategy, which facilitates a more precise identification of buggy methods from a holistic view. The evaluation on 835 real bugs from 23 Java projects shows that CosFL successfully localizes 324 bugs within Top-1, which significantly outperforms the state-of-the-art approaches by 26.6%-57.3%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17230v1-abstract-full').style.display = 'none'; document.getElementById('2411.17230v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17180">arXiv:2411.17180</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17180">pdf</a>, <a href="https://arxiv.org/format/2411.17180">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Training a neural netwok for data reduction and better generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sardy%2C+S">Sylvain Sardy</a>, <a href="/search/cs?searchtype=author&amp;query=van+Cutsem%2C+M">Maxime van Cutsem</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoyu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17180v1-abstract-short" style="display: inline;"> The motivation for sparse learners is to compress the inputs (features) by selecting only the ones needed for good generalization. Linear models with LASSO-type regularization achieve this by setting the weights of irrelevant features to zero, effectively identifying and ignoring them. In artificial neural networks, this selective focus can be achieved by pruning the input layer. Given a cost func&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17180v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17180v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17180v1-abstract-full" style="display: none;"> The motivation for sparse learners is to compress the inputs (features) by selecting only the ones needed for good generalization. Linear models with LASSO-type regularization achieve this by setting the weights of irrelevant features to zero, effectively identifying and ignoring them. In artificial neural networks, this selective focus can be achieved by pruning the input layer. 
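   A hedged sketch of the two-step idea described above (query generation, then retrieval over candidate methods). This is not CosFL's implementation: `describe_failure_with_llm` and `embed` are hypothetical stand-ins for an LLM call and a semantic encoder, and the ranking is plain cosine similarity.

```python
import numpy as np

def describe_failure_with_llm(failing_test: str, error_log: str) -> str:
    """Hypothetical step 1 (query generation): an LLM describes, in natural language,
    what the buggy functionality should do. Stubbed here for illustration only."""
    return f"Functionality exercised by {failing_test} that raises: {error_log}"

def embed(text: str) -> np.ndarray:
    """Hypothetical semantic encoder; any text/code embedding model could be substituted."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.normal(size=256)
    return v / np.linalg.norm(v)

def rank_suspicious_methods(query: str, methods: dict[str, str], top_k: int = 5):
    """Step 2 (fault retrieval): score each candidate method by similarity to the query."""
    q = embed(query)
    scores = {name: float(embed(body) @ q) for name, body in methods.items()}
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]

# Toy usage: two candidate methods, one failing test.
query = describe_failure_with_llm("testParseDate", "DateTimeParseException at Parser.parse")
print(rank_suspicious_methods(query, {"Parser.parse": "...", "Cache.get": "..."}))
```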
3. arXiv:2411.17180 (stat.ML, cs.LG)
   Training a neural network for data reduction and better generalization
   Authors: Sylvain Sardy, Maxime van Cutsem, Xiaoyu Ma
   Abstract: The motivation for sparse learners is to compress the inputs (features) by selecting only the ones needed for good generalization. Linear models with LASSO-type regularization achieve this by setting the weights of irrelevant features to zero, effectively identifying and ignoring them. In artificial neural networks, this selective focus can be achieved by pruning the input layer. Given a cost function enhanced with a sparsity-promoting penalty, our proposal selects a regularization term $\lambda$ (without the use of cross-validation or a validation set) that creates a local minimum in the cost function at the origin where no features are selected. This local minimum acts as a baseline, meaning that if there is no strong enough signal to justify a feature inclusion, the local minimum remains at zero with a high prescribed probability. The method is flexible, applying to complex models ranging from shallow to deep artificial neural networks and supporting various cost functions and sparsity-promoting penalties. We empirically show a remarkable phase transition in the probability of retrieving the relevant features, as well as good generalization thanks to the choice of $\lambda$, the non-convex penalty and the optimization scheme developed. This approach can be seen as a form of compressed sensing for complex models, allowing us to distill high-dimensional data into a compact, interpretable subset of meaningful features.
   Submitted 26 November, 2024; originally announced November 2024.
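   To make the sparsity-promoting penalty on the input layer concrete, here is a minimal PyTorch sketch with a group-lasso-style penalty on the columns of the first-layer weight matrix (one group per input feature). It does not reproduce the paper's contribution: $\lambda$ is set by hand rather than by the paper's analytic rule, and a convex penalty with Adam replaces the non-convex penalty and optimization scheme the authors develop, so column norms only shrink toward zero rather than hitting it exactly.

```python
import torch
import torch.nn as nn

# Toy data: 20 features, only the first 3 carry signal.
torch.manual_seed(0)
X = torch.randn(512, 20)
y = X[:, :3].sum(dim=1, keepdim=True) + 0.1 * torch.randn(512, 1)

net = nn.Sequential(nn.Linear(20, 16), nn.ReLU(), nn.Linear(16, 1))
lam = 0.05  # NOTE: the paper selects lambda analytically; here it is just a hand-picked constant.
opt = torch.optim.Adam(net.parameters(), lr=1e-2)

for _ in range(2000):
    opt.zero_grad()
    mse = nn.functional.mse_loss(net(X), y)
    w1 = net[0].weight                  # shape (16, 20): column j belongs to input feature j
    penalty = w1.norm(dim=0).sum()      # sum of per-feature group norms
    (mse + lam * penalty).backward()
    opt.step()

# Columns whose norm stays small are treated as pruned input features.
kept = (net[0].weight.norm(dim=0) > 1e-2).nonzero().flatten()
print("features retained:", kept.tolist())   # ideally close to [0, 1, 2]
```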
4. arXiv:2411.16365 (cs.CL)
   Multi-modal Retrieval Augmented Multi-modal Generation: A Benchmark, Evaluate Metrics and Strong Baselines
   Authors: Zi-Ao Ma, Tian Lan, Rong-Cheng Tu, Yong Hu, Heyan Huang, Xian-Ling Mao
   Abstract: This paper investigates an intriguing task of Multi-modal Retrieval Augmented Multi-modal Generation (M$^2$RAG). This task requires foundation models to browse multi-modal web pages, with mixed text and images, and generate multi-modal responses for solving user queries, which exhibits better information density and readability. Given the early researching stage of M$^2$RAG task, there is a lack of systematic studies and analysis. To fill this gap, we construct a benchmark for M$^2$RAG task, equipped with a suite of text-modal metrics and multi-modal metrics to analyze the capabilities of existing foundation models. Besides, we also propose several effective methods for foundation models to accomplish this task, based on the comprehensive evaluation results on our benchmark. Extensive experimental results reveal several intriguing phenomena worth further research.
   Submitted 25 November, 2024; originally announced November 2024.

5. arXiv:2411.15871 (cs.DC)
   Hiding Communication Cost in Distributed LLM Training via Micro-batch Co-execution
   Authors: Haiquan Wang, Chaoyi Ruan, Jia He, Jiaqi Ruan, Chengjie Tang, Xiaosong Ma, Cheng Li
   Abstract: The growth of Large Language Models (LLMs) has necessitated large-scale distributed training. Highly optimized frameworks, however, still suffer significant losses in Model FLOPS utilization (often below 50%) due to large communication volumes. Meanwhile, our comprehensive profiling shows that the computation- and communication-intensive operators overlap well. This paper introduces DHelix, a novel micro-structure that dramatically improves the efficiency of LLM training inspired by the DNA structure. Central to DHelix's design is Strand Interleaving (SI), which views the continuous stream of training micro-batches through a GPU as two strands. DHelix juxtaposes the forward and backward passes of the two strands and performs a systematic optimization for an SI plan that co-schedules the operators from the opposite strands, enabled by operator-level overlap profiling results and a dynamic-programming based search algorithm. Meanwhile, DHelix enables the two strands to share model states and space for activation data, effectively accommodating two micro-batches with under 3% extra memory space. DHelix seamlessly integrates with all forms of existing data/model parallelism, the most challenging being pipeline parallelism, thanks to its unique model folding design that results in a W-shaped pipeline. We evaluate DHelix training with the popular Llama and GPT dense models, plus the Phi Mixture of Expert (MoE) model, across 3 GPU clusters (A40, A800, and H100). Results show that it achieves 12-40% (up to 58% MFU) and 2-29% (up to 71% MFU) improvement on the 64-A40 and 64-A800 clusters, respectively, significantly outperforming state-of-the-art methods. On the H100 cluster, though the faster network reduces DHelix's profit margin, it makes cross-node tensor parallelism promising, a practice currently prohibitive due to communication costs.
   Submitted 24 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15770">arXiv:2411.15770</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15770">pdf</a>, <a href="https://arxiv.org/format/2411.15770">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Text-Guided Coarse-to-Fine Fusion Network for Robust Remote Sensing Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhicheng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Changfu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenglong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoliang Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15770v1-abstract-short" style="display: inline;"> Remote Sensing Visual Question Answering (RSVQA) has gained significant research interest. However, current RSVQA methods are limited by the imaging mechanisms of optical sensors, particularly under challenging conditions such as cloud-covered and low-light scenarios. Given the all-time and all-weather imaging capabilities of Synthetic Aperture Radar (SAR), it is crucial to investigate the integra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15770v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15770v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15770v1-abstract-full" style="display: none;"> Remote Sensing Visual Question Answering (RSVQA) has gained significant research interest. However, current RSVQA methods are limited by the imaging mechanisms of optical sensors, particularly under challenging conditions such as cloud-covered and low-light scenarios. Given the all-time and all-weather imaging capabilities of Synthetic Aperture Radar (SAR), it is crucial to investigate the integration of optical-SAR images to improve RSVQA performance. In this work, we propose a Text-guided Coarse-to-Fine Fusion Network (TGFNet), which leverages the semantic relationships between question text and multi-source images to guide the network toward complementary fusion at the feature level. Specifically, we develop a Text-guided Coarse-to-Fine Attention Refinement (CFAR) module to focus on key areas related to the question in complex remote sensing images. This module progressively directs attention from broad areas to finer details through key region routing, enhancing the model&#39;s ability to focus on relevant regions. Furthermore, we propose an Adaptive Multi-Expert Fusion (AMEF) module that dynamically integrates different experts, enabling the adaptive fusion of optical and SAR features. 
In addition, we create the first large-scale benchmark dataset for evaluating optical-SAR RSVQA methods, comprising 6,008 well-aligned optical-SAR image pairs and 1,036,694 well-labeled question-answer pairs across 16 diverse question types, including complex relational reasoning questions. Extensive experiments on the proposed dataset demonstrate that our TGFNet effectively integrates complementary information between optical and SAR images, significantly improving the model&#39;s performance in challenging scenarios. The dataset is available at: https://github.com/mmic-lcl/. Index Terms: Remote Sensing Visual Question Answering, Multi-source Data Fusion, Multimodal, Remote Sensing, OPT-SAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15770v1-abstract-full').style.display = 'none'; document.getElementById('2411.15770v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15501">arXiv:2411.15501</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15501">pdf</a>, <a href="https://arxiv.org/format/2411.15501">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Instruct or Interact? Exploring and Eliciting LLMs&#39; Capability in Code Snippet Adaptation Through Prompt Engineering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tanghaoran Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yue Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xinjun Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shangwen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+K">Kang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yuxin Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15501v1-abstract-short" style="display: inline;"> Code snippet adaptation is a fundamental activity in the software development process. Unlike code generation, code snippet adaptation is not a &#34;free creation&#34;, which requires developers to tailor a given code snippet in order to fit specific requirements and the code context. 
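   The text-guided fusion idea can be illustrated with a small cross-attention sketch: the question embedding attends separately over optical and SAR feature tokens, and a learned gate mixes the two summaries. This is only a rough stand-in for the CFAR and AMEF modules described above; all shapes and module names are hypothetical.

```python
import torch
import torch.nn as nn

class TextGuidedFusionToy(nn.Module):
    """Illustrative only: the question embedding cross-attends to optical and SAR tokens,
    then a learned gate mixes the two modality summaries (a stand-in for the paper's
    coarse-to-fine refinement and multi-expert fusion modules)."""
    def __init__(self, dim: int = 256, heads: int = 4):
        super().__init__()
        self.attn_opt = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.attn_sar = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.gate = nn.Sequential(nn.Linear(2 * dim, dim), nn.ReLU(),
                                  nn.Linear(dim, 2), nn.Softmax(dim=-1))

    def forward(self, q_text, opt_tokens, sar_tokens):
        # q_text: (B, 1, D) question embedding; *_tokens: (B, N, D) image features.
        o, _ = self.attn_opt(q_text, opt_tokens, opt_tokens)   # question-guided optical summary
        s, _ = self.attn_sar(q_text, sar_tokens, sar_tokens)   # question-guided SAR summary
        w = self.gate(torch.cat([o, s], dim=-1))               # adaptive per-sample modality weights
        return w[..., :1] * o + w[..., 1:] * s                 # fused representation (B, 1, D)

fusion = TextGuidedFusionToy()
out = fusion(torch.randn(2, 1, 256), torch.randn(2, 196, 256), torch.randn(2, 196, 256))
print(out.shape)  # torch.Size([2, 1, 256])
```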
7. arXiv:2411.15501 (cs.SE, cs.AI, cs.HC)
   Instruct or Interact? Exploring and Eliciting LLMs' Capability in Code Snippet Adaptation Through Prompt Engineering
   Authors: Tanghaoran Zhang, Yue Yu, Xinjun Mao, Shangwen Wang, Kang Yang, Yao Lu, Zhang Zhang, Yuxin Zhao
   Abstract: Code snippet adaptation is a fundamental activity in the software development process. Unlike code generation, code snippet adaptation is not a "free creation", which requires developers to tailor a given code snippet in order to fit specific requirements and the code context. Recently, large language models (LLMs) have confirmed their effectiveness in the code generation task with promising results. However, their performance on adaptation, a reuse-oriented and context-dependent code change prediction task, is still unclear. To bridge this gap, we conduct an empirical study to investigate the performance and issues of LLMs on the adaptation task. We first evaluate the adaptation performances of three popular LLMs and compare them to the code generation task. Our result indicates that their adaptation ability is weaker than generation, with a nearly 15% decrease on pass@1 and more context-related errors. By manually inspecting 200 cases, we further investigate the causes of LLMs' sub-optimal performance, which can be classified into three categories, i.e., Unclear Requirement, Requirement Misalignment and Context Misapplication. Based on the above empirical research, we propose an interactive prompting approach to eliciting LLMs' adaptation ability. Experimental result reveals that our approach greatly improve LLMs' adaptation performance. The best-performing Human-LLM interaction successfully solves 159 out of the 202 identified defects and improves the pass@1 and pass@5 by over 40% compared to the initial instruction-based prompt. Considering human efforts, we suggest multi-agent interaction as a trade-off, which can achieve comparable performance with excellent generalization ability. We deem that our approach could provide methodological assistance for autonomous code snippet reuse and adaptation with LLMs.
   Submitted 23 November, 2024; originally announced November 2024.
   Comments: 12 pages, 10 figures, accepted by ICSE 2025

8. arXiv:2411.15488 (cs.CL, cs.AI, cs.CV)
   Automatic Evaluation for Text-to-image Generation: Task-decomposed Framework, Distilled Training, and Meta-evaluation Benchmark
   Authors: Rong-Cheng Tu, Zi-Ao Ma, Tian Lan, Yuehao Zhao, Heyan Huang, Xian-Ling Mao
   Abstract: Driven by the remarkable progress in diffusion models, text-to-image generation has made significant strides, creating a pressing demand for automatic quality evaluation of generated images. Current state-of-the-art automatic evaluation methods heavily rely on Multi-modal Large Language Models (MLLMs), particularly powerful commercial models like GPT-4o. While these models are highly effective, their substantial costs limit scalability in large-scale evaluations. Adopting open-source MLLMs is an alternative; however, their performance falls short due to significant limitations in processing multi-modal data compared to commercial MLLMs. To tackle these problems, we first propose a task decomposition evaluation framework based on GPT-4o to automatically construct a new training dataset, where the complex evaluation task is decoupled into simpler sub-tasks, effectively reducing the learning complexity. Based on this dataset, we design innovative training strategies to effectively distill GPT-4o's evaluation capabilities into a 7B open-source MLLM, MiniCPM-V-2.6. Furthermore, to reliably and comprehensively assess prior works and our proposed model, we manually annotate a meta-evaluation benchmark that includes chain-of-thought explanations alongside quality scores for generated images. Experimental results demonstrate that our distilled open-source MLLM significantly outperforms the current state-of-the-art GPT-4o-base baseline, VIEScore, with over 4.6% improvement in Spearman and Kendall correlations with human judgments.
   Submitted 23 November, 2024; originally announced November 2024.
9. arXiv:2411.15244 (cs.CV, cs.AI)
   Adversarial Prompt Distillation for Vision-Language Models
   Authors: Lin Luo, Xin Wang, Bojia Zi, Shihao Zhao, Xingjun Ma
   Abstract: Large pre-trained Vision-Language Models (VLMs) such as Contrastive Language-Image Pre-Training (CLIP) have been shown to be susceptible to adversarial attacks, raising concerns about their deployment in safety-critical scenarios like autonomous driving and medical diagnosis. One promising approach for improving the robustness of pre-trained VLMs is Adversarial Prompt Tuning (APT), which combines adversarial training with prompt tuning. However, existing APT methods are mostly single-modal methods that design prompt(s) for only the visual or textual modality, limiting their effectiveness in either robustness or clean accuracy. In this work, we propose a novel method called Adversarial Prompt Distillation (APD) that combines APT with knowledge distillation to boost the adversarial robustness of CLIP. Specifically, APD is a bimodal method that adds prompts for both the visual and textual modalities while leveraging a cleanly pre-trained teacher CLIP model to distill and boost the performance of the student CLIP model on downstream tasks. Extensive experiments on multiple benchmark datasets demonstrate the superiority of our APD over the current state-of-the-art APT methods in terms of both natural and adversarial performances. The effectiveness of our APD method validates the possibility of using a non-robust teacher to improve the generalization and robustness of VLMs.
   Submitted 21 November, 2024; originally announced November 2024.
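   A minimal sketch of the distillation-plus-adversarial objective suggested by the abstract, assuming the student (prompted CLIP) is fed adversarial images while the clean teacher supplies soft targets. This is not the paper's exact loss; the logits below are random placeholders for CLIP image-text similarity scores.

```python
import torch
import torch.nn.functional as F

def apd_style_loss(student_logits_adv: torch.Tensor,
                   teacher_logits_clean: torch.Tensor,
                   labels: torch.Tensor,
                   alpha: float = 0.5,
                   T: float = 2.0) -> torch.Tensor:
    """Illustrative objective: cross-entropy on adversarial inputs plus temperature-scaled
    KL distillation from a clean teacher (a hedged reading of the APD idea)."""
    ce = F.cross_entropy(student_logits_adv, labels)
    kd = F.kl_div(F.log_softmax(student_logits_adv / T, dim=-1),
                  F.softmax(teacher_logits_clean / T, dim=-1),
                  reduction="batchmean") * (T * T)
    return (1 - alpha) * ce + alpha * kd

# Toy check with random "logits" standing in for CLIP similarity scores over 10 classes.
s = torch.randn(8, 10, requires_grad=True)   # student logits on adversarial images
t = torch.randn(8, 10)                        # teacher logits on clean images
print(apd_style_loss(s, t, torch.randint(0, 10, (8,))).item())
```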
10. arXiv:2411.15210 (cs.LG, cs.AI, cs.CR, cs.CV)
   Towards Million-Scale Adversarial Robustness Evaluation With Stronger Individual Attacks
   Authors: Yong Xie, Weijie Zheng, Hanxun Huang, Guangnan Ye, Xingjun Ma
   Abstract: As deep learning models are increasingly deployed in safety-critical applications, evaluating their vulnerabilities to adversarial perturbations is essential for ensuring their reliability and trustworthiness. Over the past decade, a large number of white-box adversarial robustness evaluation methods (i.e., attacks) have been proposed, ranging from single-step to multi-step methods and from individual to ensemble methods. Despite these advances, challenges remain in conducting meaningful and comprehensive robustness evaluations, particularly when it comes to large-scale testing and ensuring evaluations reflect real-world adversarial risks. In this work, we focus on image classification models and propose a novel individual attack method, Probability Margin Attack (PMA), which defines the adversarial margin in the probability space rather than the logits space. We analyze the relationship between PMA and existing cross-entropy or logits-margin-based attacks, and show that PMA can outperform the current state-of-the-art individual methods. Building on PMA, we propose two types of ensemble attacks that balance effectiveness and efficiency. Furthermore, we create a million-scale dataset, CC1M, derived from the existing CC3M dataset, and use it to conduct the first million-scale white-box adversarial robustness evaluation of adversarially-trained ImageNet models. Our findings provide valuable insights into the robustness gaps between individual versus ensemble attacks and small-scale versus million-scale evaluations.
   Submitted 20 November, 2024; originally announced November 2024.
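   One plausible reading of the probability-margin idea, sketched as a PGD-style attack whose loss is the margin computed on softmax probabilities rather than logits. This is an illustration, not necessarily the paper's exact formulation of PMA.

```python
import torch
import torch.nn.functional as F

def probability_margin(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Margin in probability space: p_true - max_{j != true} p_j (per sample)."""
    probs = F.softmax(logits, dim=-1)
    p_true = probs.gather(1, labels[:, None]).squeeze(1)
    p_other = probs.clone()
    p_other.scatter_(1, labels[:, None], -1.0)          # mask out the true class
    return p_true - p_other.max(dim=1).values

def pma_style_pgd(model, x, y, eps=8 / 255, step=2 / 255, iters=10):
    """PGD within an L-infinity ball that drives the probability margin negative."""
    x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1).detach()
    for _ in range(iters):
        x_adv.requires_grad_(True)
        loss = probability_margin(model(x_adv), y).mean()   # minimize the margin
        grad = torch.autograd.grad(loss, x_adv)[0]
        x_adv = x_adv.detach() - step * grad.sign()
        x_adv = x.detach() + (x_adv - x).clamp(-eps, eps)   # project back into the eps-ball
        x_adv = x_adv.clamp(0, 1)
    return x_adv.detach()

# Toy usage with a hypothetical classifier.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
x, y = torch.rand(4, 3, 32, 32), torch.randint(0, 10, (4,))
x_adv = pma_style_pgd(model, x, y)
print((x_adv - x).abs().max().item() <= 8 / 255 + 1e-6)  # perturbation stays within budget
```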
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13477">arXiv:2411.13477</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13477">pdf</a>, <a href="https://arxiv.org/format/2411.13477">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> PatentEdits: Framing Patent Novelty as Textual Entailment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+R">Ryan Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Spangher%2C+A">Alexander Spangher</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xuezhe Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13477v1-abstract-short" style="display: inline;"> A patent must be deemed novel and non-obvious in order to be granted by the US Patent Office (USPTO). If it is not, a US patent examiner will cite the prior work, or prior art, that invalidates the novelty and issue a non-final rejection. Predicting what claims of the invention should change given the prior art is an essential and crucial step in securing invention rights, yet has not been studied&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13477v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13477v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13477v1-abstract-full" style="display: none;"> A patent must be deemed novel and non-obvious in order to be granted by the US Patent Office (USPTO). If it is not, a US patent examiner will cite the prior work, or prior art, that invalidates the novelty and issue a non-final rejection. Predicting what claims of the invention should change given the prior art is an essential and crucial step in securing invention rights, yet has not been studied before as a learnable task. In this work we introduce the PatentEdits dataset, which contains 105K examples of successful revisions that overcome objections to novelty. We design algorithms to label edits sentence by sentence, then establish how well these edits can be predicted with large language models (LLMs). We demonstrate that evaluating textual entailment between cited references and draft sentences is especially effective in predicting which inventive claims remained unchanged or are novel in relation to prior art. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13477v1-abstract-full').style.display = 'none'; document.getElementById('2411.13477v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13136">arXiv:2411.13136</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13136">pdf</a>, <a href="https://arxiv.org/format/2411.13136">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TAPT: Test-Time Adversarial Prompt Tuning for Robust Inference in Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jingjing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xingjun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13136v1-abstract-short" style="display: inline;"> Large pre-trained Vision-Language Models (VLMs) such as CLIP have demonstrated excellent zero-shot generalizability across various downstream tasks. However, recent studies have shown that the inference performance of CLIP can be greatly degraded by small adversarial perturbations, especially its visual modality, posing significant safety threats. To mitigate this vulnerability, in this paper, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13136v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13136v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13136v1-abstract-full" style="display: none;"> Large pre-trained Vision-Language Models (VLMs) such as CLIP have demonstrated excellent zero-shot generalizability across various downstream tasks. However, recent studies have shown that the inference performance of CLIP can be greatly degraded by small adversarial perturbations, especially its visual modality, posing significant safety threats. To mitigate this vulnerability, in this paper, we propose a novel defense method called Test-Time Adversarial Prompt Tuning (TAPT) to enhance the inference robustness of CLIP against visual adversarial attacks. TAPT is a test-time defense method that learns defensive bimodal (textual and visual) prompts to robustify the inference process of CLIP. Specifically, it is an unsupervised method that optimizes the defensive prompts for each test sample by minimizing a multi-view entropy and aligning adversarial-clean distributions. 
We evaluate the effectiveness of TAPT on 11 benchmark datasets, including ImageNet and 10 other zero-shot datasets, demonstrating that it enhances the zero-shot adversarial robustness of the original CLIP by at least 48.9% against AutoAttack (AA), while largely maintaining performance on clean examples. Moreover, TAPT outperforms existing adversarial prompt tuning methods across various backbones, achieving an average robustness improvement of at least 36.6%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13136v1-abstract-full').style.display = 'none'; document.getElementById('2411.13136v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10996">arXiv:2411.10996</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10996">pdf</a>, <a href="https://arxiv.org/ps/2411.10996">ps</a>, <a href="https://arxiv.org/format/2411.10996">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> </div> </div> <p class="title is-5 mathjax"> Gadgetless Lifting Beats Round Elimination: Improved Lower Bounds for Pointer Chasing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xinyu Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+G">Guangxu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiapeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10996v1-abstract-short" style="display: inline;"> We prove an Ω(n/k+k) communication lower bound on (k-1)-round distributional complexity of the k-step pointer chasing problem under uniform input distribution, improving the Ω(n/k - k log n) lower bound due to Yehudayoff (Combinatorics Probability and Computing, 2020). Our lower bound almost matches the upper bound of O(n/k + k) communication by Nisan and Wigderson (STOC 91). As part of our appr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10996v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10996v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10996v1-abstract-full" style="display: none;"> We prove an Ω(n/k+k) communication lower bound on (k-1)-round distributional complexity of the k-step pointer chasing problem under uniform input distribution, improving the Ω(n/k - k log n) lower bound due to Yehudayoff (Combinatorics Probability and Computing, 2020). Our lower bound almost matches the upper bound of O(n/k + k) communication by Nisan and Wigderson (STOC 91). As part of our approach, we put forth gadgetless lifting, a new framework that lifts lower bounds for a family of restricted protocols into lower bounds for general protocols. A key step in gadgetless lifting is choosing the appropriate definition of restricted protocols. 
In this paper, our definition of restricted protocols is inspired by the structure-vs-pseudorandomness decomposition by Göös, Pitassi, and Watson (FOCS 17) and Yang and Zhang (STOC 24). Previously, round-communication trade-offs were mainly obtained by round elimination and information complexity. Both methods have some barriers in some situations, and we believe gadgetless lifting could potentially address these barriers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10996v1-abstract-full').style.display = 'none'; document.getElementById('2411.10996v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07863">arXiv:2411.07863</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07863">pdf</a>, <a href="https://arxiv.org/format/2411.07863">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> CDXFormer: Boosting Remote Sensing Change Detection with Extended Long Short-Term Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhenkai Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaowen Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+R">Rongrong Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhentao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07863v1-abstract-short" style="display: inline;"> In complex scenes and varied conditions, effectively integrating spatial-temporal context is crucial for accurately identifying changes. However, current RS-CD methods lack a balanced consideration of performance and efficiency. CNNs lack global context, Transformers have quadratic computational complexity, and Mambas are restricted by CUDA acceleration. In this paper, we propose CDXFormer, with a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07863v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07863v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07863v1-abstract-full" style="display: none;"> In complex scenes and varied conditions, effectively integrating spatial-temporal context is crucial for accurately identifying changes. However, current RS-CD methods lack a balanced consideration of performance and efficiency. CNNs lack global context, Transformers have quadratic computational complexity, and Mambas are restricted by CUDA acceleration. 
In this paper, we propose CDXFormer, with a core component that is a powerful XLSTM-based feature enhancement layer, integrating the advantages of linear computational complexity, global context perception, and strong interpret-ability. Specifically, we introduce a scale-specific Feature Enhancer layer, incorporating a Cross-Temporal Global Perceptron customized for semantic-accurate deep features, and a Cross-Temporal Spatial Refiner customized for detail-rich shallow features. Additionally, we propose a Cross-Scale Interactive Fusion module to progressively interact global change representations with spatial responses. Extensive experimental results demonstrate that CDXFormer achieves state-of-the-art performance across three benchmark datasets, offering a compelling balance between efficiency and accuracy. Code is available at https://github.com/xwmaxwma/rschange. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07863v1-abstract-full').style.display = 'none'; document.getElementById('2411.07863v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07457">arXiv:2411.07457</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07457">pdf</a>, <a href="https://arxiv.org/format/2411.07457">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DecoPrompt : Decoding Prompts Reduces Hallucinations when Large Language Models Meet False Premises </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xuezhe Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07457v1-abstract-short" style="display: inline;"> While large language models (LLMs) have demonstrated increasing power, they have also called upon studies on their hallucinated outputs that deviate from factually correct statements. In this paper, we focus on one important scenario of false premises, where LLMs are distracted by misaligned claims although the model possesses the required factual knowledge to answer original questions accurately.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07457v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07457v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07457v1-abstract-full" style="display: none;"> While large language models (LLMs) have demonstrated increasing power, they have also called upon studies on their hallucinated outputs that deviate from factually correct statements. In this paper, we focus on one important scenario of false premises, where LLMs are distracted by misaligned claims although the model possesses the required factual knowledge to answer original questions accurately. 
Inspired by the observation that entropy of the false-premise prompt is closely related to its likelihood to elicit hallucination generation, we propose a new prompting algorithm, named DecoPrompt, to mitigate hallucination. DecoPrompt leverages LLMs to &#34;decode&#34; the false-premise prompts without really eliciting hallucination output from LLMs. We perform experiments on two datasets, demonstrating that DecoPrompt can reduce hallucinations effectively on outputs from different LLMs. Moreover, DecoPrompt exhibits cross-model transferability, which facilitates its applications to scenarios such as LLMs of large sizes or unavailable model logits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07457v1-abstract-full').style.display = 'none'; document.getElementById('2411.07457v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05088">arXiv:2411.05088</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05088">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Findings of the IWSLT 2024 Evaluation Campaign </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ahmad%2C+I+S">Ibrahim Said Ahmad</a>, <a href="/search/cs?searchtype=author&amp;query=Anastasopoulos%2C+A">Antonios Anastasopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Bojar%2C+O">Ondřej Bojar</a>, <a href="/search/cs?searchtype=author&amp;query=Borg%2C+C">Claudia Borg</a>, <a href="/search/cs?searchtype=author&amp;query=Carpuat%2C+M">Marine Carpuat</a>, <a href="/search/cs?searchtype=author&amp;query=Cattoni%2C+R">Roldano Cattoni</a>, <a href="/search/cs?searchtype=author&amp;query=Cettolo%2C+M">Mauro Cettolo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Q">Qianqian Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Federico%2C+M">Marcello Federico</a>, <a href="/search/cs?searchtype=author&amp;query=Haddow%2C+B">Barry Haddow</a>, <a href="/search/cs?searchtype=author&amp;query=Javorsk%C3%BD%2C+D">Dávid Javorský</a>, <a href="/search/cs?searchtype=author&amp;query=Krubi%C5%84ski%2C+M">Mateusz Krubiński</a>, <a href="/search/cs?searchtype=author&amp;query=Lam%2C+T+K">Tsz Kin Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xutai Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Mathur%2C+P">Prashant Mathur</a>, <a href="/search/cs?searchtype=author&amp;query=Matusov%2C+E">Evgeny Matusov</a>, <a href="/search/cs?searchtype=author&amp;query=Maurya%2C+C">Chandresh Maurya</a>, <a href="/search/cs?searchtype=author&amp;query=McCrae%2C+J">John McCrae</a>, <a href="/search/cs?searchtype=author&amp;query=Murray%2C+K">Kenton Murray</a>, <a href="/search/cs?searchtype=author&amp;query=Nakamura%2C+S">Satoshi Nakamura</a>, <a href="/search/cs?searchtype=author&amp;query=Negri%2C+M">Matteo Negri</a>, <a href="/search/cs?searchtype=author&amp;query=Niehues%2C+J">Jan 
Niehues</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+X">Xing Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Ojha%2C+A+K">Atul Kr. Ojha</a> , et al. (20 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05088v1-abstract-short" style="display: inline;"> This paper reports on the shared tasks organized by the 21st IWSLT Conference. The shared tasks address 7 scientific challenges in spoken language translation: simultaneous and offline translation, automatic subtitling and dubbing, speech-to-speech translation, dialect and low-resource speech translation, and Indic languages. The shared tasks attracted 18 teams whose submissions are documented in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05088v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05088v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05088v1-abstract-full" style="display: none;"> This paper reports on the shared tasks organized by the 21st IWSLT Conference. The shared tasks address 7 scientific challenges in spoken language translation: simultaneous and offline translation, automatic subtitling and dubbing, speech-to-speech translation, dialect and low-resource speech translation, and Indic languages. The shared tasks attracted 18 teams whose submissions are documented in 26 system papers. The growing interest towards spoken language translation is also witnessed by the constantly increasing number of shared task organizers and contributors to the overview paper, almost evenly distributed across industry and academia. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05088v1-abstract-full').style.display = 'none'; document.getElementById('2411.05088v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IWSLT 2024; 59 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03964">arXiv:2411.03964</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03964">pdf</a>, <a href="https://arxiv.org/format/2411.03964">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> What Really is Commonsense Knowledge? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Do%2C+Q+V">Quyet V. 
Do</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junze Li</a>, <a href="/search/cs?searchtype=author&amp;query=Vuong%2C+T">Tung-Duong Vuong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhaowei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yangqiu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaojuan Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03964v1-abstract-short" style="display: inline;"> Commonsense datasets have been well developed in Natural Language Processing, mainly through crowdsource human annotation. However, there are debates on the genuineness of commonsense reasoning benchmarks. In specific, a significant portion of instances in some commonsense benchmarks do not concern commonsense knowledge. That problem would undermine the measurement of the true commonsense reasonin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03964v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03964v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03964v1-abstract-full" style="display: none;"> Commonsense datasets have been well developed in Natural Language Processing, mainly through crowdsource human annotation. However, there are debates on the genuineness of commonsense reasoning benchmarks. In specific, a significant portion of instances in some commonsense benchmarks do not concern commonsense knowledge. That problem would undermine the measurement of the true commonsense reasoning ability of evaluated models. It is also suggested that the problem originated from a blurry concept of commonsense knowledge, as distinguished from other types of knowledge. To demystify all of the above claims, in this study, we survey existing definitions of commonsense knowledge, ground into the three frameworks for defining concepts, and consolidate them into a multi-framework unified definition of commonsense knowledge (so-called consolidated definition). We then use the consolidated definition for annotations and experiments on the CommonsenseQA and CommonsenseQA 2.0 datasets to examine the above claims. Our study shows that there exists a large portion of non-commonsense-knowledge instances in the two datasets, and a large performance gap on these two subsets where Large Language Models (LLMs) perform worse on commonsense-knowledge instances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03964v1-abstract-full').style.display = 'none'; document.getElementById('2411.03964v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code and data will be released together with the next version of the paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02448">arXiv:2411.02448</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02448">pdf</a>, <a href="https://arxiv.org/format/2411.02448">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rate, Explain and Cite (REC): Enhanced Explanation and Attribution in Automatic Evaluation by Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+A+R">Aliyah R. Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">James Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhichao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bi%2C+B">Bin Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Mehrotra%2C+S">Shubham Mehrotra</a>, <a href="/search/cs?searchtype=author&amp;query=Pentyala%2C+S+K">Shiva K. Pentyala</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+K">Katherine Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xiang-Bo Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Omrani%2C+R">Roshanak Omrani</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhuri%2C+S">Sougata Chaudhuri</a>, <a href="/search/cs?searchtype=author&amp;query=Radhakrishnan%2C+R">Regunathan Radhakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Asur%2C+S">Sitaram Asur</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+C+N">Claire Na Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02448v1-abstract-short" style="display: inline;"> LLMs have demonstrated impressive proficiency in generating coherent and high-quality text, making them valuable across a range of text-generation tasks. However, rigorous evaluation of this generated content is crucial, as ensuring its quality remains a significant challenge due to persistent issues such as factual inaccuracies and hallucinations. This paper introduces two fine-tuned general-purp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02448v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02448v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02448v1-abstract-full" style="display: none;"> LLMs have demonstrated impressive proficiency in generating coherent and high-quality text, making them valuable across a range of text-generation tasks. However, rigorous evaluation of this generated content is crucial, as ensuring its quality remains a significant challenge due to persistent issues such as factual inaccuracies and hallucinations. 
This paper introduces two fine-tuned general-purpose LLM autoevaluators, REC-12B and REC-70B, specifically designed to evaluate generated text across several dimensions: faithfulness, instruction following, coherence, and completeness. These models not only provide ratings for these metrics but also offer detailed explanations and verifiable citations, thereby enhancing trust in the content. Moreover, the models support various citation modes, accommodating different requirements for latency and granularity. Extensive evaluations on diverse benchmarks demonstrate that our general-purpose LLM auto-evaluator, REC-70B, outperforms state-of-the-art LLMs, excelling in content evaluation by delivering better quality explanations and citations with minimal bias. It achieves Rank \#1 as a generative model on the RewardBench leaderboard\footnote{\url{https://huggingface.co/spaces/allenai/reward-bench}} under the model name \texttt{TextEval-Llama3.1-70B}. Our REC dataset and models are released at \url{https://github.com/adelaidehsu/REC}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02448v1-abstract-full').style.display = 'none'; document.getElementById('2411.02448v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01850">arXiv:2411.01850</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01850">pdf</a>, <a href="https://arxiv.org/format/2411.01850">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ManiBox: Enhancing Spatial Grasping Generalization via Scalable Simulation Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+H">Hengkai Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xuezhou Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ying%2C+C">Chengyang Ying</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xinyi Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Songming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xingxing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01850v1-abstract-short" style="display: inline;"> Learning a precise robotic grasping policy is crucial for embodied agents operating in complex real-world manipulation tasks. Despite significant advancements, most models still struggle with accurate spatial positioning of objects to be grasped. We first show that this spatial generalization challenge stems primarily from the extensive data requirements for adequate spatial understanding. 
However&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01850v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01850v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01850v1-abstract-full" style="display: none;"> Learning a precise robotic grasping policy is crucial for embodied agents operating in complex real-world manipulation tasks. Despite significant advancements, most models still struggle with accurate spatial positioning of objects to be grasped. We first show that this spatial generalization challenge stems primarily from the extensive data requirements for adequate spatial understanding. However, collecting such data with real robots is prohibitively expensive, and relying on simulation data often leads to visual generalization gaps upon deployment. To overcome these challenges, we then focus on state-based policy generalization and present \textbf{ManiBox}, a novel bounding-box-guided manipulation method built on a simulation-based teacher-student framework. The teacher policy efficiently generates scalable simulation data using bounding boxes, which are proven to uniquely determine the objects&#39; spatial positions. The student policy then utilizes these low-dimensional spatial states to enable zero-shot transfer to real robots. Through comprehensive evaluations in simulated and real-world environments, ManiBox demonstrates a marked improvement in spatial grasping generalization and adaptability to diverse objects and backgrounds. Further, our empirical study into scaling laws for policy performance indicates that spatial volume generalization scales positively with data volume. For a certain level of spatial volume, the success rate of grasping empirically follows Michaelis-Menten kinetics relative to data volume, showing a saturation effect as data increases. Our videos and code are available in https://thkkk.github.io/manibox. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01850v1-abstract-full').style.display = 'none'; document.getElementById('2411.01850v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01353">arXiv:2411.01353</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01353">pdf</a>, <a href="https://arxiv.org/format/2411.01353">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Can Large Language Model Predict Employee Attrition? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoye Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Weiheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Changyi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Tukhvatulina%2C+L+R">Liliya R. 
Tukhvatulina</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01353v1-abstract-short" style="display: inline;"> Employee attrition poses significant costs for organizations, with traditional statistical prediction methods often struggling to capture modern workforce complexities. Machine learning (ML) advancements offer more scalable and accurate solutions, but large language models (LLMs) introduce new potential in human resource management by interpreting nuanced employee communication and detecting subtl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01353v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01353v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01353v1-abstract-full" style="display: none;"> Employee attrition poses significant costs for organizations, with traditional statistical prediction methods often struggling to capture modern workforce complexities. Machine learning (ML) advancements offer more scalable and accurate solutions, but large language models (LLMs) introduce new potential in human resource management by interpreting nuanced employee communication and detecting subtle turnover cues. This study leverages the IBM HR Analytics Attrition dataset to compare the predictive accuracy and interpretability of a fine-tuned GPT-3.5 model against traditional ML classifiers, including Logistic Regression, k-Nearest Neighbors (KNN), Support Vector Machine (SVM), Decision Tree, Random Forest, AdaBoost, and XGBoost. While traditional models are easier to use and interpret, LLMs can reveal deeper patterns in employee behavior. Our findings show that the fine-tuned GPT-3.5 model outperforms traditional methods with a precision of 0.91, recall of 0.94, and an F1-score of 0.92, while the best traditional model, SVM, achieved an F1-score of 0.82, with Random Forest and XGBoost reaching 0.80. These results highlight GPT-3.5&#39;s ability to capture complex patterns in attrition risk, offering organizations improved insights for retention strategies and underscoring the value of LLMs in HR applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01353v1-abstract-full').style.display = 'none'; document.getElementById('2411.01353v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00827">arXiv:2411.00827</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00827">pdf</a>, <a href="https://arxiv.org/format/2411.00827">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IDEATOR: Jailbreaking Large Vision-Language Models Using Themselves </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruofan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaosen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00827v2-abstract-short" style="display: inline;"> As large Vision-Language Models (VLMs) grow in prominence, ensuring their safe deployment has become critical. Recent studies have explored VLM robustness against jailbreak attacks--techniques that exploit model vulnerabilities to elicit harmful outputs. However, the limited availability of diverse multi-modal data has led current approaches to rely heavily on adversarial or manually crafted image&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00827v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00827v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00827v2-abstract-full" style="display: none;"> As large Vision-Language Models (VLMs) grow in prominence, ensuring their safe deployment has become critical. Recent studies have explored VLM robustness against jailbreak attacks--techniques that exploit model vulnerabilities to elicit harmful outputs. However, the limited availability of diverse multi-modal data has led current approaches to rely heavily on adversarial or manually crafted images derived from harmful text datasets, which may lack effectiveness and diversity across different contexts. In this paper, we propose a novel jailbreak method named IDEATOR, which autonomously generates malicious image-text pairs for black-box jailbreak attacks. IDEATOR is based on the insight that VLMs themselves could serve as powerful red team models for generating multimodal jailbreak prompts. Specifically, IDEATOR uses a VLM to create targeted jailbreak texts and pairs them with jailbreak images generated by a state-of-the-art diffusion model. Our extensive experiments demonstrate IDEATOR&#39;s high effectiveness and transferability. Notably, it achieves a 94% success rate in jailbreaking MiniGPT-4 with an average of only 5.34 queries, and high success rates of 82%, 88%, and 75% when transferred to LLaVA, InstructBLIP, and Meta&#39;s Chameleon, respectively. IDEATOR uncovers specific vulnerabilities in VLMs under black-box conditions, underscoring the need for improved safety mechanisms. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00827v2-abstract-full').style.display = 'none'; document.getElementById('2411.00827v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22938">arXiv:2410.22938</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22938">pdf</a>, <a href="https://arxiv.org/format/2410.22938">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DiffLight: A Partial Rewards Conditioned Diffusion Model for Traffic Signal Control with Missing Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hanyang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yang Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xiaowei Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22938v2-abstract-short" style="display: inline;"> The application of reinforcement learning in traffic signal control (TSC) has been extensively researched and yielded notable achievements. However, most existing works for TSC assume that traffic data from all surrounding intersections is fully and continuously available through sensors. In real-world applications, this assumption often fails due to sensor malfunctions or data loss, making TSC wi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22938v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22938v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22938v2-abstract-full" style="display: none;"> The application of reinforcement learning in traffic signal control (TSC) has been extensively researched and yielded notable achievements. However, most existing works for TSC assume that traffic data from all surrounding intersections is fully and continuously available through sensors. In real-world applications, this assumption often fails due to sensor malfunctions or data loss, making TSC with missing data a critical challenge. To meet the needs of practical applications, we introduce DiffLight, a novel conditional diffusion model for TSC under data-missing scenarios in the offline setting. 
Specifically, we integrate two essential sub-tasks, i.e., traffic data imputation and decision-making, by leveraging a Partial Rewards Conditioned Diffusion (PRCD) model to prevent missing rewards from interfering with the learning process. Meanwhile, to effectively capture the spatial-temporal dependencies among intersections, we design a Spatial-Temporal transFormer (STFormer) architecture. In addition, we propose a Diffusion Communication Mechanism (DCM) to promote better communication and control performance under data-missing scenarios. Extensive experiments on five datasets with various data-missing scenarios demonstrate that DiffLight is an effective controller to address TSC with missing data. The code of DiffLight is released at https://github.com/lokol5579/DiffLight-release. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22938v2-abstract-full').style.display = 'none'; document.getElementById('2410.22938v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22629">arXiv:2410.22629</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22629">pdf</a>, <a href="https://arxiv.org/format/2410.22629">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable Remote Sensing Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Z">Ziyang Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zhixiang Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Di Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xianzheng Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongruixuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Y">Yuru Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Y">Yupeng Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Z">Zhenming Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xiangwei Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yokoya%2C+N">Naoto Yokoya</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liangpei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22629v2-abstract-short" style="display: inline;"> The field of Remote Sensing Domain Generalization (RSDG) has emerged as a critical and valuable research frontier, focusing 
on developing models that generalize effectively across diverse scenarios. Despite the substantial domain gaps in RS images that are characterized by variabilities such as location, wavelength, and sensor type, research in this area remains underexplored: (1) Current cross-do&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22629v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22629v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22629v2-abstract-full" style="display: none;"> The field of Remote Sensing Domain Generalization (RSDG) has emerged as a critical and valuable research frontier, focusing on developing models that generalize effectively across diverse scenarios. Despite the substantial domain gaps in RS images that are characterized by variabilities such as location, wavelength, and sensor type, research in this area remains underexplored: (1) Current cross-domain methods primarily focus on Domain Adaptation (DA), which adapts models to predefined domains rather than to unseen ones; (2) Few studies targeting the RSDG issue, especially for semantic segmentation tasks, where existing models are developed for specific unknown domains, struggling with issues of underfitting on other unknown scenarios; (3) Existing RS foundation models tend to prioritize in-domain performance over cross-domain generalization. To this end, we introduce the first vision foundation model for RSDG semantic segmentation, CrossEarth. CrossEarth demonstrates strong cross-domain generalization through a specially designed data-level Earth-Style Injection pipeline and a model-level Multi-Task Training pipeline. In addition, for the semantic segmentation task, we have curated an RSDG benchmark comprising 28 cross-domain settings across various regions, spectral bands, platforms, and climates, providing a comprehensive framework for testing the generalizability of future RSDG models. Extensive experiments on this benchmark demonstrate the superiority of CrossEarth over existing state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22629v2-abstract-full').style.display = 'none'; document.getElementById('2410.22629v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The codes and models will be available at https://github.com/Cuzyoung/CrossEarth</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21840">arXiv:2410.21840</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21840">pdf</a>, <a href="https://arxiv.org/format/2410.21840">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Optimized Homomorphic Vector Permutation From New Decomposition Techniques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xirong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+J">Junling Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Duong%2C+D+H">Dung Hoang Duong</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yali Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+C">Chunpeng Ge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21840v3-abstract-short" style="display: inline;"> Homomorphic vector permutation is fundamental to privacy-preserving computations based on batch-encoded homomorphic encryption, underpinning nearly all homomorphic matrix operation algorithms and predominantly influencing their complexity. A potential approach to optimize this critical component lies in permutation decomposition, a technique we consider as not yet fully explored. In this paper, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21840v3-abstract-full').style.display = 'inline'; document.getElementById('2410.21840v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21840v3-abstract-full" style="display: none;"> Homomorphic vector permutation is fundamental to privacy-preserving computations based on batch-encoded homomorphic encryption, underpinning nearly all homomorphic matrix operation algorithms and predominantly influencing their complexity. A potential approach to optimize this critical component lies in permutation decomposition, a technique we consider as not yet fully explored. In this paper, we enhance the efficiency of homomorphic permutations through novel decomposition techniques, thus advancing privacy-preserving computations. We start by estimating the ideal performance of decompositions on permutations and proposing an algorithm that searches depth-1 ideal decomposition solutions. This enables us to ascertain the full-depth ideal decomposability of specific permutations in homomorphic matrix transposition (SIGSAC 18) and multiplication (CCSW 22), allowing these privacy-preserving computations to achieve asymptotic improvement in speed and rotation key reduction. We further devise a new method for computing arbitrary homomorphic permutations, aiming to approximate the performance of ideal decomposition, as permutations with weak structures are unlikely to be ideally factorized. Our design deviates from the conventional scope of permutation decomposition. 
It outperforms state-of-the-art techniques (EUROCRYPT 12, CRYPTO 14) with a speed-up of up to $\times2.27$ under the minimum requirement of rotation keys. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21840v3-abstract-full').style.display = 'none'; document.getElementById('2410.21840v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submission on 11/11/2024, context and authors&#39; info updated</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21438">arXiv:2410.21438</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21438">pdf</a>, <a href="https://arxiv.org/format/2410.21438">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UFT: Unifying Fine-Tuning of SFT and RLHF/DPO/UNA through a Generalized Implicit Reward Function </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhichao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bi%2C+B">Bin Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zixu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xiangbo Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21438v1-abstract-short" style="display: inline;"> By pretraining on trillions of tokens, an LLM gains the capability of text generation. However, to enhance its utility and reduce potential harm, SFT and alignment are applied sequentially to the pretrained model. Due to the differing nature and objective functions of SFT and alignment, catastrophic forgetting has become a significant issue. To address this, we introduce Unified Fine-Tuning (UFT),&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21438v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21438v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21438v1-abstract-full" style="display: none;"> By pretraining on trillions of tokens, an LLM gains the capability of text generation. However, to enhance its utility and reduce potential harm, SFT and alignment are applied sequentially to the pretrained model. Due to the differing nature and objective functions of SFT and alignment, catastrophic forgetting has become a significant issue. 
To address this, we introduce Unified Fine-Tuning (UFT), which integrates SFT and alignment into a single training stage using the same objective and loss functions through an implicit reward function. Our experimental results demonstrate that UFT outperforms SFT on instruction-tuning data alone. Moreover, when combining instruction-tuning data with alignment data, UFT effectively prevents catastrophic forgetting across these two stages and shows a clear advantage over sequentially applying SFT and alignment. This is evident in the significant improvements observed in the \textbf{ifeval} task for instruction-following and the \textbf{truthful-qa} task for factuality. The proposed general fine-tuning framework UFT establishes an effective and efficient pretraining-UFT paradigm for LLM training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21438v1-abstract-full').style.display = 'none'; document.getElementById('2410.21438v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20971">arXiv:2410.20971</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20971">pdf</a>, <a href="https://arxiv.org/format/2410.20971">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BlueSuffix: Reinforced Blue Teaming for Vision-Language Models Against Jailbreak Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yunhan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xiang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Lin Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yige Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20971v1-abstract-short" style="display: inline;"> Despite their superb multimodal capabilities, Vision-Language Models (VLMs) have been shown to be vulnerable to jailbreak attacks, which are inference-time attacks that induce the model to output harmful responses with tricky prompts. It is thus essential to defend VLMs against potential jailbreaks for their trustworthy deployment in real-world applications. 
In this work, we focus on black-box def&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20971v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20971v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20971v1-abstract-full" style="display: none;"> Despite their superb multimodal capabilities, Vision-Language Models (VLMs) have been shown to be vulnerable to jailbreak attacks, which are inference-time attacks that induce the model to output harmful responses with tricky prompts. It is thus essential to defend VLMs against potential jailbreaks for their trustworthy deployment in real-world applications. In this work, we focus on black-box defense for VLMs against jailbreak attacks. Existing black-box defense methods are either unimodal or bimodal. Unimodal methods enhance either the vision or language module of the VLM, while bimodal methods robustify the model through text-image representation realignment. However, these methods suffer from two limitations: 1) they fail to fully exploit the cross-modal information, or 2) they degrade the model performance on benign inputs. To address these limitations, we propose a novel blue-team method BlueSuffix that defends the black-box target VLM against jailbreak attacks without compromising its performance. BlueSuffix includes three key components: 1) a visual purifier against jailbreak images, 2) a textual purifier against jailbreak texts, and 3) a blue-team suffix generator fine-tuned via reinforcement learning for enhancing cross-modal robustness. We empirically show on three VLMs (LLaVA, MiniGPT-4, and Gemini) and two safety benchmarks (MM-SafetyBench and RedTeam-2K) that BlueSuffix outperforms the baseline defenses by a significant margin. Our BlueSuffix opens up a promising direction for defending VLMs against jailbreak attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20971v1-abstract-full').style.display = 'none'; document.getElementById('2410.20971v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
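<p> Illustrative sketch (not from the paper above): the BlueSuffix abstract describes a pre-processing pipeline around a black-box VLM, so a minimal composition of its three stated components might look as follows. Every name here (purify_image, purify_text, generate_blue_suffix, query_vlm) is a hypothetical placeholder, not the authors' code or API. </p>
<pre><code class="language-python">
def defend_and_query(image, prompt, purify_image, purify_text,
                     generate_blue_suffix, query_vlm):
    """Generic blue-team wrapper: purify both modalities, then append a
    defensive suffix before querying the black-box VLM. All injected
    callables are hypothetical stand-ins for the components named in the
    abstract, not the authors' implementation."""
    safe_image = purify_image(image)                        # 1) visual purifier
    safe_prompt = purify_text(prompt)                       # 2) textual purifier
    suffix = generate_blue_suffix(safe_image, safe_prompt)  # 3) RL-tuned suffix generator
    return query_vlm(safe_image, safe_prompt + " " + suffix)
</code></pre>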
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20957">arXiv:2410.20957</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20957">pdf</a>, <a href="https://arxiv.org/format/2410.20957">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Neuro-symbolic Learning Yielding Logical Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zenan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yunpeng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jingwei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Taolue Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoxing Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20957v1-abstract-short" style="display: inline;"> Neuro-symbolic systems combine the abilities of neural perception and logical reasoning. However, end-to-end learning of neuro-symbolic systems is still an unsolved challenge. This paper proposes a natural framework that fuses neural network training, symbol grounding, and logical constraint synthesis into a coherent and efficient end-to-end learning process. The capability of this framework comes&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20957v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20957v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20957v1-abstract-full" style="display: none;"> Neuro-symbolic systems combine the abilities of neural perception and logical reasoning. However, end-to-end learning of neuro-symbolic systems is still an unsolved challenge. This paper proposes a natural framework that fuses neural network training, symbol grounding, and logical constraint synthesis into a coherent and efficient end-to-end learning process. The capability of this framework comes from the improved interactions between the neural and the symbolic parts of the system in both the training and inference stages. Technically, to bridge the gap between the continuous neural network and the discrete logical constraint, we introduce a difference-of-convex programming technique to relax the logical constraints while maintaining their precision. We also employ cardinality constraints as the language for logical constraint learning and incorporate a trust region method to avoid the degeneracy of logical constraint in learning. Both theoretical analyses and empirical evaluations substantiate the effectiveness of the proposed framework. 
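<p> Purely illustrative aside, not the paper's difference-of-convex construction: the abstract above uses cardinality constraints as the language for logical constraint learning and relaxes the discrete constraints so the network can be trained by gradient descent. A deliberately simpler stand-in for that idea is a differentiable penalty for an "at most k indicators hold" constraint, sketched below under that assumption. </p>
<pre><code class="language-python">
import torch

def at_most_k_penalty(logits, k):
    """Differentiable surrogate for 'at most k of the binary indicators hold'.
    Sigmoid probabilities give a soft count; only the amount by which the
    expected count exceeds k is penalized, so satisfied constraints cost 0."""
    soft_count = torch.sigmoid(logits).sum(dim=-1)
    excess = torch.clamp(soft_count - k, min=0.0)
    return (excess ** 2).mean()

# Example: penalize a batch of 4 samples, each with 10 indicator logits.
loss = at_most_k_penalty(torch.randn(4, 10), k=3)
</code></pre>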
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20957v1-abstract-full').style.display = 'none'; document.getElementById('2410.20957v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at NeurIPS 2023, and code is available at [this url](https://github.com/Lizn-zn/Nesy-Programming)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20936">arXiv:2410.20936</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20936">pdf</a>, <a href="https://arxiv.org/format/2410.20936">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Autoformalize Mathematical Statements by Symbolic Equivalence and Semantic Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zenan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yifan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+X">Xinming Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoxing Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20936v1-abstract-short" style="display: inline;"> Autoformalization, the task of automatically translating natural language descriptions into a formal language, poses a significant challenge across various domains, especially in mathematics. Recent advancements in large language models (LLMs) have unveiled their promising capabilities to formalize even competition-level math problems. However, we observe a considerable discrepancy between pass@1&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20936v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20936v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20936v1-abstract-full" style="display: none;"> Autoformalization, the task of automatically translating natural language descriptions into a formal language, poses a significant challenge across various domains, especially in mathematics. Recent advancements in large language models (LLMs) have unveiled their promising capabilities to formalize even competition-level math problems. However, we observe a considerable discrepancy between pass@1 and pass@k accuracies in LLM-generated formalizations. 
To address this gap, we introduce a novel framework that scores and selects the best result from k autoformalization candidates based on two complementary self-consistency methods: symbolic equivalence and semantic consistency. Elaborately, symbolic equivalence identifies the logical homogeneity among autoformalization candidates using automated theorem provers, and semantic consistency evaluates the preservation of the original meaning by informalizing the candidates and computing the similarity between the embeddings of the original and informalized texts. Our extensive experiments on the MATH and miniF2F datasets demonstrate that our approach significantly enhances autoformalization accuracy, achieving up to 0.22-1.35x relative improvements across various LLMs and baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20936v1-abstract-full').style.display = 'none'; document.getElementById('2410.20936v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at NeurIPS 2024. Code is available at [this https URL](https://github.com/Miracle-Messi/Isa-AutoFormal)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20374">arXiv:2410.20374</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20374">pdf</a>, <a href="https://arxiv.org/format/2410.20374">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A CT-guided Control Framework of a Robotic Flexible Endoscope for the Diagnosis of the Maxillary Sinusitis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+P">Puchen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Huayu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xin Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xiaoyin Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuchen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Au%2C+K+W+S">Kwok Wai Samuel Au</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20374v1-abstract-short" style="display: inline;"> Flexible endoscopes are commonly adopted in narrow and confined anatomical cavities due to their higher reachability and dexterity. However, prolonged and unintuitive manipulation of these endoscopes leads to an increased workload on surgeons and risks of collision. 
To address these challenges, this paper proposes a CT-guided control framework for the diagnosis of maxillary sinusitis by using a ro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20374v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20374v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20374v1-abstract-full" style="display: none;"> Flexible endoscopes are commonly adopted in narrow and confined anatomical cavities due to their higher reachability and dexterity. However, prolonged and unintuitive manipulation of these endoscopes leads to an increased workload on surgeons and risks of collision. To address these challenges, this paper proposes a CT-guided control framework for the diagnosis of maxillary sinusitis by using a robotic flexible endoscope. In the CT-guided control framework, a feasible path to the target position in the maxillary sinus cavity for the robotic flexible endoscope is designed. Besides, an optimal control scheme is proposed to autonomously control the robotic flexible endoscope to follow the feasible path. This greatly improves the efficiency and reduces the workload for surgeons. Several experiments were conducted based on a widely utilized sinus phantom, and the results showed that the robotic flexible endoscope can accurately and autonomously follow the feasible path and reach the target position in the maxillary sinus cavity. The results also verified the feasibility of the CT-guided control framework, which contributes an effective approach to early diagnosis of sinusitis in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20374v1-abstract-full').style.display = 'none'; document.getElementById('2410.20374v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19742">arXiv:2410.19742</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19742">pdf</a>, <a href="https://arxiv.org/format/2410.19742">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3666025.3699323">10.1145/3666025.3699323 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SALINA: Towards Sustainable Live Sonar Analytics in Wild Ecosystems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+R">Rongsheng Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+H">Hao Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoqiang Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Atlas%2C+W+I">William I. Atlas</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiangchuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Spoljaric%2C+M+A">Mark A. Spoljaric</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19742v1-abstract-short" style="display: inline;"> Sonar radar captures visual representations of underwater objects and structures using sound wave reflections, making it essential for exploration, mapping, and continuous surveillance in wild ecosystems. Real-time analysis of sonar data is crucial for time-sensitive applications, including environmental anomaly detection and in-season fishery management, where rapid decision-making is needed. How&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19742v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19742v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19742v1-abstract-full" style="display: none;"> Sonar radar captures visual representations of underwater objects and structures using sound wave reflections, making it essential for exploration, mapping, and continuous surveillance in wild ecosystems. Real-time analysis of sonar data is crucial for time-sensitive applications, including environmental anomaly detection and in-season fishery management, where rapid decision-making is needed. However, the lack of both relevant datasets and pre-trained DNN models, coupled with resource limitations in wild environments, hinders the effective deployment and continuous operation of live sonar analytics. We present SALINA, a sustainable live sonar analytics system designed to address these challenges. 
SALINA enables real-time processing of acoustic sonar data with spatial and temporal adaptations, and features energy-efficient operation through a robust energy management module. Deployed for six months at two inland rivers in British Columbia, Canada, SALINA provided continuous 24/7 underwater monitoring, supporting fishery stewardship and wildlife restoration efforts. Through extensive real-world testing, SALINA demonstrated an up to 9.5% improvement in average precision and a 10.1% increase in tracking metrics. The energy management module successfully handled extreme weather, preventing outages and reducing contingency costs. These results offer valuable insights for long-term deployment of acoustic data systems in the wild. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19742v1-abstract-full').style.display = 'none'; document.getElementById('2410.19742v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, accepted by ACM SenSys 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19427">arXiv:2410.19427</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19427">pdf</a>, <a href="https://arxiv.org/format/2410.19427">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Expose Before You Defend: Unifying and Enhancing Backdoor Defenses via Exposed Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yige Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Hanxun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19427v1-abstract-short" style="display: inline;"> Backdoor attacks covertly implant triggers into deep neural networks (DNNs) by poisoning a small portion of the training data with pre-designed backdoor triggers. This vulnerability is exacerbated in the era of large models, where extensive (pre-)training on web-crawled datasets is susceptible to compromise. 
In this paper, we introduce a novel two-step defense framework named Expose Before You Def&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19427v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19427v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19427v1-abstract-full" style="display: none;"> Backdoor attacks covertly implant triggers into deep neural networks (DNNs) by poisoning a small portion of the training data with pre-designed backdoor triggers. This vulnerability is exacerbated in the era of large models, where extensive (pre-)training on web-crawled datasets is susceptible to compromise. In this paper, we introduce a novel two-step defense framework named Expose Before You Defend (EBYD). EBYD unifies existing backdoor defense methods into a comprehensive defense system with enhanced performance. Specifically, EBYD first exposes the backdoor functionality in the backdoored model through a model preprocessing step called backdoor exposure, and then applies detection and removal methods to the exposed model to identify and eliminate the backdoor features. In the first step of backdoor exposure, we propose a novel technique called Clean Unlearning (CUL), which proactively unlearns clean features from the backdoored model to reveal the hidden backdoor features. We also explore various model editing/modification techniques for backdoor exposure, including fine-tuning, model sparsification, and weight perturbation. Using EBYD, we conduct extensive experiments on 10 image attacks and 6 text attacks across 2 vision datasets (CIFAR-10 and an ImageNet subset) and 4 language datasets (SST-2, IMDB, Twitter, and AG&#39;s News). The results demonstrate the importance of backdoor exposure for backdoor defense, showing that the exposed models can significantly benefit a range of downstream defense tasks, including backdoor label detection, backdoor trigger recovery, backdoor model detection, and backdoor removal. We hope our work could inspire more research in developing advanced defense frameworks with exposed models. Our code is available at: https://github.com/bboylyg/Expose-Before-You-Defend. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19427v1-abstract-full').style.display = 'none'; document.getElementById('2410.19427v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
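<p> Hedged illustration only: the abstract's key exposure step, Clean Unlearning (CUL), "unlearns clean features" from the suspect model but does not state the update rule. One common reading of unlearning a dataset is gradient ascent on its loss, which the sketch below applies to a copy of the model; this is an assumption for illustration, not the authors' CUL procedure (see the repository linked in the abstract). </p>
<pre><code class="language-python">
import copy
import torch
import torch.nn.functional as F

def expose_by_clean_unlearning(model, clean_loader, steps=100, lr=1e-3):
    """Return an 'exposed' copy of the model after ascending the loss on clean
    data, a generic way to strip clean features and surface backdoor behavior."""
    exposed = copy.deepcopy(model)          # keep the original model intact
    opt = torch.optim.SGD(exposed.parameters(), lr=lr)
    exposed.train()
    batches = iter(clean_loader)
    for _ in range(steps):
        try:
            x, y = next(batches)
        except StopIteration:
            batches = iter(clean_loader)
            x, y = next(batches)
        loss = F.cross_entropy(exposed(x), y)
        opt.zero_grad()
        (-loss).backward()                  # gradient ASCENT on clean loss = unlearn clean features
        opt.step()
    return exposed                          # hand this to downstream detection/removal steps
</code></pre>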
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18615">arXiv:2410.18615</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18615">pdf</a>, <a href="https://arxiv.org/format/2410.18615">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FairQueue: Rethinking Prompt Learning for Fair Text-to-Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Teo%2C+C+T+H">Christopher T. H Teo</a>, <a href="/search/cs?searchtype=author&amp;query=Abdollahzadeh%2C+M">Milad Abdollahzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xinda Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Cheung%2C+N">Ngai-man Cheung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18615v1-abstract-short" style="display: inline;"> Recently, prompt learning has emerged as the state-of-the-art (SOTA) for fair text-to-image (T2I) generation. Specifically, this approach leverages readily available reference images to learn inclusive prompts for each target Sensitive Attribute (tSA), allowing for fair image generation. In this work, we first reveal that this prompt learning-based approach results in degraded sample quality. Our&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18615v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18615v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18615v1-abstract-full" style="display: none;"> Recently, prompt learning has emerged as the state-of-the-art (SOTA) for fair text-to-image (T2I) generation. Specifically, this approach leverages readily available reference images to learn inclusive prompts for each target Sensitive Attribute (tSA), allowing for fair image generation. In this work, we first reveal that this prompt learning-based approach results in degraded sample quality. Our analysis shows that the approach&#39;s training objective -- which aims to align the embedding differences of learned prompts and reference images -- could be sub-optimal, resulting in distortion of the learned prompts and degraded generated images. To further substantiate this claim, as our major contribution, we deep dive into the denoising subnetwork of the T2I model to track down the effect of these learned prompts by analyzing the cross-attention maps. In our analysis, we propose a novel prompt switching analysis: I2H and H2I. Furthermore, we propose new quantitative characterization of cross-attention maps. Our analysis reveals abnormalities in the early denoising steps, perpetuating improper global structure that results in degradation in the generated samples. 
Building on insights from our analysis, we propose two ideas: (i) Prompt Queuing and (ii) Attention Amplification to address the quality issue. Extensive experimental results on a wide range of tSAs show that our proposed method outperforms SOTA approach&#39;s image generation quality, while achieving competitive fairness. More resources at FairQueue Project site: https://sutd-visual-computing-group.github.io/FairQueue <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18615v1-abstract-full').style.display = 'none'; document.getElementById('2410.18615v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in NeurIPS24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17856">arXiv:2410.17856</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17856">pdf</a>, <a href="https://arxiv.org/format/2410.17856">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ROCKET-1: Mastering Open-World Interaction with Visual-Temporal Context Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+S">Shaofei Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+K">Kewei Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+Z">Zhancun Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaojian Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Anji Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yitao Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17856v2-abstract-short" style="display: inline;"> Vision-language models (VLMs) have excelled in multimodal tasks, but adapting them to embodied decision-making in open-world environments presents challenges. One critical issue is bridging the gap between discrete entities in low-level observations and the abstract concepts required for effective planning. A common solution is building hierarchical agents, where VLMs serve as high-level reasoners&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17856v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17856v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17856v2-abstract-full" style="display: none;"> Vision-language models (VLMs) have excelled in multimodal tasks, but adapting them to embodied decision-making in open-world environments presents challenges. 
One critical issue is bridging the gap between discrete entities in low-level observations and the abstract concepts required for effective planning. A common solution is building hierarchical agents, where VLMs serve as high-level reasoners that break down tasks into executable sub-tasks, typically specified using language. However, language suffers from the inability to communicate detailed spatial information. We propose visual-temporal context prompting, a novel communication protocol between VLMs and policy models. This protocol leverages object segmentation from past observations to guide policy-environment interactions. Using this approach, we train ROCKET-1, a low-level policy that predicts actions based on concatenated visual observations and segmentation masks, supported by real-time object tracking from SAM-2. Our method unlocks the potential of VLMs, enabling them to tackle complex tasks that demand spatial reasoning. Experiments in Minecraft show that our approach enables agents to achieve previously unattainable tasks, with a $\mathbf{76}\%$ absolute improvement in open-world interaction performance. Codes and demos are now available on the project page: https://craftjarvis.github.io/ROCKET-1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17856v2-abstract-full').style.display = 'none'; document.getElementById('2410.17856v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17802">arXiv:2410.17802</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17802">pdf</a>, <a href="https://arxiv.org/format/2410.17802">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> GenUDC: High Quality 3D Mesh Generation with Unsigned Dual Contouring Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruowei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+D">Dan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xueqi Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zixiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qijun Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17802v1-abstract-short" style="display: inline;"> Generating high-quality meshes with complex structures and realistic surfaces is the primary goal of 3D generative models. Existing methods typically employ sequence data or deformable tetrahedral grids for mesh generation. 
However, sequence-based methods have difficulty producing complex structures with many faces due to memory limits. The deformable tetrahedral grid-based method MeshDiffusion fa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17802v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17802v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17802v1-abstract-full" style="display: none;"> Generating high-quality meshes with complex structures and realistic surfaces is the primary goal of 3D generative models. Existing methods typically employ sequence data or deformable tetrahedral grids for mesh generation. However, sequence-based methods have difficulty producing complex structures with many faces due to memory limits. The deformable tetrahedral grid-based method MeshDiffusion fails to recover realistic surfaces due to the inherent ambiguity in deformable grids. We propose the GenUDC framework to address these challenges by leveraging the Unsigned Dual Contouring (UDC) as the mesh representation. UDC discretizes a mesh in a regular grid and divides it into the face and vertex parts, recovering both complex structures and fine details. As a result, the one-to-one mapping between UDC and mesh resolves the ambiguity problem. In addition, GenUDC adopts a two-stage, coarse-to-fine generative process for 3D mesh generation. It first generates the face part as a rough shape and then the vertex part to craft a detailed shape. Extensive evaluations demonstrate the superiority of UDC as a mesh representation and the favorable performance of GenUDC in mesh generation. The code and trained models are available at https://github.com/TrepangCat/GenUDC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17802v1-abstract-full').style.display = 'none'; document.getElementById('2410.17802v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACMMM 2024, code:https://github.com/TrepangCat/GenUDC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17136">arXiv:2410.17136</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17136">pdf</a>, <a href="https://arxiv.org/format/2410.17136">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AlphaChimp: Tracking and Behavior Recognition of Chimpanzees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoxuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yutang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Kaufhold%2C+S+P">Stephan P. 
Kaufhold</a>, <a href="/search/cs?searchtype=author&amp;query=Terwilliger%2C+J">Jack Terwilliger</a>, <a href="/search/cs?searchtype=author&amp;query=Meza%2C+A">Andres Meza</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Rossano%2C+F">Federico Rossano</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17136v2-abstract-short" style="display: inline;"> Understanding non-human primate behavior is crucial for improving animal welfare, modeling social behavior, and gaining insights into both distinctly human and shared behaviors. Despite recent advances in computer vision, automated analysis of primate behavior remains challenging due to the complexity of their social interactions and the lack of specialized algorithms. Existing methods often strug&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17136v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17136v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17136v2-abstract-full" style="display: none;"> Understanding non-human primate behavior is crucial for improving animal welfare, modeling social behavior, and gaining insights into both distinctly human and shared behaviors. Despite recent advances in computer vision, automated analysis of primate behavior remains challenging due to the complexity of their social interactions and the lack of specialized algorithms. Existing methods often struggle with the nuanced behaviors and frequent occlusions characteristic of primate social dynamics. This study aims to develop an effective method for automated detection, tracking, and recognition of chimpanzee behaviors in video footage. Here we show that our proposed method, AlphaChimp, an end-to-end approach that simultaneously detects chimpanzee positions and estimates behavior categories from videos, significantly outperforms existing methods in behavior recognition. AlphaChimp achieves approximately 10% higher tracking accuracy and a 20% improvement in behavior recognition compared to state-of-the-art methods, particularly excelling in the recognition of social behaviors. This superior performance stems from AlphaChimp&#39;s innovative architecture, which integrates temporal feature fusion with a Transformer-based self-attention mechanism, enabling more effective capture and interpretation of complex social interactions among chimpanzees. Our approach bridges the gap between computer vision and primatology, enhancing technical capabilities and deepening our understanding of primate communication and sociality. We release our code and models and hope this will facilitate future research in animal social dynamics. This work contributes to ethology, cognitive science, and artificial intelligence, offering new perspectives on social intelligence. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17136v2-abstract-full').style.display = 'none'; document.getElementById('2410.17136v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An extension of ChimpACT [arXiv:2310.16447], proposes AlphaChimp for tracking and behavior recognition of chimpanzees. arXiv admin note: substantial text overlap with arXiv:2310.16447</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16853">arXiv:2410.16853</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16853">pdf</a>, <a href="https://arxiv.org/format/2410.16853">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Modality Gap: Dimension Information Alignment and Sparse Spatial Constraint for Image-Text Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiang Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuemei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+L">Lexin Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Caiming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16853v1-abstract-short" style="display: inline;"> Many contrastive learning based models have achieved advanced performance in image-text matching tasks. The key of these models lies in analyzing the correlation between image-text pairs, which involves cross-modal interaction of embeddings in corresponding dimensions. However, the embeddings of different modalities are from different models or modules, and there is a significant modality gap. Dir&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16853v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16853v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16853v1-abstract-full" style="display: none;"> Many contrastive learning based models have achieved advanced performance in image-text matching tasks. The key of these models lies in analyzing the correlation between image-text pairs, which involves cross-modal interaction of embeddings in corresponding dimensions. However, the embeddings of different modalities are from different models or modules, and there is a significant modality gap. Directly interacting such embeddings lacks rationality and may capture inaccurate correlation. 
Therefore, we propose a novel method called DIAS to bridge the modality gap from two aspects: (1) We align the information representation of embeddings from different modalities in corresponding dimension to ensure the correlation calculation is based on interactions of similar information. (2) The spatial constraints of inter- and intra-modalities unmatched pairs are introduced to ensure the effectiveness of semantic alignment of the model. Besides, a sparse correlation algorithm is proposed to select strong correlated spatial relationships, enabling the model to learn more significant features and avoid being misled by weak correlation. Extensive experiments demonstrate the superiority of DIAS, achieving 4.3\%-10.2\% rSum improvements on Flickr30k and MSCOCO benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16853v1-abstract-full').style.display = 'none'; document.getElementById('2410.16853v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16197">arXiv:2410.16197</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16197">pdf</a>, <a href="https://arxiv.org/format/2410.16197">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> LASER: Script Execution by Autonomous Agents for On-demand Traffic Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Hao Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingyue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+W">Wenyang Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jingwei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yunpeng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Taolue Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoxing Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16197v3-abstract-short" style="display: inline;"> Autonomous Driving Systems (ADS) require diverse and safety-critical traffic scenarios for effective training and testing, but the existing data generation methods struggle to provide flexibility and scalability. We propose LASER, a novel frame-work that leverage large language models (LLMs) to conduct traffic simulations based on natural language inputs. 
The framework operates in two stages: it f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16197v3-abstract-full').style.display = 'inline'; document.getElementById('2410.16197v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16197v3-abstract-full" style="display: none;"> Autonomous Driving Systems (ADS) require diverse and safety-critical traffic scenarios for effective training and testing, but the existing data generation methods struggle to provide flexibility and scalability. We propose LASER, a novel framework that leverages large language models (LLMs) to conduct traffic simulations based on natural language inputs. The framework operates in two stages: it first generates scripts from user-provided descriptions and then executes them using autonomous agents in real time. Validated in the CARLA simulator, LASER successfully generates complex, on-demand driving scenarios, significantly improving ADS training and testing data generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16197v3-abstract-full').style.display = 'none'; document.getElementById('2410.16197v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15859">arXiv:2410.15859</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15859">pdf</a>, <a href="https://arxiv.org/format/2410.15859">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Mesa-Extrapolation: A Weave Position Encoding Method for Enhanced Extrapolation in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xin Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingjing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaoxu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15859v3-abstract-short" style="display: inline;"> Large language models (LLMs), although having revolutionized many fields, still suffer from the challenging extrapolation problem, where the inference ability of LLMs sharply declines beyond their max training lengths.
In this work, we conduct a theoretical analysis to better understand why No Position Encoding (NoPE) fails outside its effective range, as well as examining the power of Position En&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15859v3-abstract-full').style.display = 'inline'; document.getElementById('2410.15859v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15859v3-abstract-full" style="display: none;"> Large language models (LLMs), although having revolutionized many fields, still suffer from the challenging extrapolation problem, where the inference ability of LLMs sharply declines beyond their max training lengths. In this work, we conduct a theoretical analysis to better understand why No Position Encoding (NoPE) fails outside its effective range, as well as examining the power of Position Encoding (PE) in this context. Our findings reveal that with meticulous weave position, PE can indeed be extended beyond effective range. Our theorems establish that LLMs equipped with weave PE can achieve improved extrapolation performance without additional cost. Furthermore, we introduce a novel weave PE method, Mesa-Extrapolation, which utilizes a chunk-based triangular attention matrix and applies Stair PE to manage the final chunk. This method not only retains competitive performance but also offers substantial benefits such as significantly reduced memory demand and faster inference speed. Extensive experiments validate the effectiveness of Mesa-Extrapolation, demonstrating its potential as a scalable solution to enhancing LLMs applicative reach. Our code is available at \url{https://github.com/soacker/Mesa-Extrapolation}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15859v3-abstract-full').style.display = 'none'; document.getElementById('2410.15859v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
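<p> Hedged illustration only: the abstract describes Mesa-Extrapolation as using a chunk-based triangular attention matrix (with Stair PE managing the final chunk) and reports reduced memory demand and faster inference, but it does not define that matrix precisely. The snippet below builds a generic chunked causal mask, in which each position attends causally and only to a bounded number of preceding chunks; chunk_size and max_prev_chunks are made-up knobs, and none of this should be read as the paper's actual construction. </p>
<pre><code class="language-python">
import torch

def chunked_causal_mask(seq_len, chunk_size, max_prev_chunks):
    """Boolean mask whose entry [q, j] is True when query position q may attend
    to key position j: j must not lie after q, and j's chunk must be within
    max_prev_chunks chunks of q's chunk."""
    idx = torch.arange(seq_len)
    chunk_id = torch.div(idx, chunk_size, rounding_mode="floor")
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))  # key not after query
    chunk_gap = chunk_id.unsqueeze(1) - chunk_id.unsqueeze(0)            # query chunk minus key chunk
    recent = chunk_gap.le(max_prev_chunks)                               # key chunk is recent enough
    return causal.logical_and(recent)

# Example: a 12-token sequence split into chunks of 4, looking back 1 chunk.
mask = chunked_causal_mask(12, chunk_size=4, max_prev_chunks=1)
</code></pre>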
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024; 13 pages and 30 pages appendix;</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15287">arXiv:2410.15287</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15287">pdf</a>, <a href="https://arxiv.org/format/2410.15287">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Training Language Models to Critique With Multi-agent Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+T">Tian Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+C">Chengqi Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuaibin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Heyan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+X">Xian-Ling Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kai Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15287v1-abstract-short" style="display: inline;"> Critique ability, a meta-cognitive capability of humans, presents significant challenges for LLMs to improve. Recent works primarily rely on supervised fine-tuning (SFT) using critiques generated by a single LLM like GPT-4. However, these model-generated critiques often exhibit flaws due to the inherent complexity of the critique. Consequently, fine-tuning LLMs on such flawed critiques typically l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15287v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15287v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15287v1-abstract-full" style="display: none;"> Critique ability, a meta-cognitive capability of humans, presents significant challenges for LLMs to improve. Recent works primarily rely on supervised fine-tuning (SFT) using critiques generated by a single LLM like GPT-4. However, these model-generated critiques often exhibit flaws due to the inherent complexity of the critique. Consequently, fine-tuning LLMs on such flawed critiques typically limits the model&#39;s performance and propagates these flaws into the learned model. To overcome these challenges, this paper proposes a novel data generation pipeline, named MultiCritique, that improves the critique ability of LLMs by utilizing multi-agent feedback in both the SFT and reinforcement learning (RL) stages. First, our data generation pipeline aggregates high-quality critiques from multiple agents instead of a single model, with crucial information as input for simplifying the critique. 

arXiv:2410.14989 [pdf] cs.AI, cs.RO
AutoFPDesigner: Automated Flight Procedure Design Based on Multi-Agent Large Language Model
Authors: Longtao Zhu, Hongyu Yang, Ge Song, Xin Ma, Yanxin Zhang, Yulong Ji
Abstract: Current flight procedure design methods rely heavily on a human-led design process, which is not only low in automation but also suffers from complex algorithm modelling and poor generalization.
To address these challenges, this paper proposes an agent-driven flight procedure design method based on large language models, named AutoFPDesigner, which utilizes multi-agent collaboration to complete the procedure design. The method enables end-to-end automated design of performance-based navigation (PBN) procedures. In this process, the user inputs the design requirements in natural language, and AutoFPDesigner models the flight procedure design by loading the design specifications and utilizing tool libraries to complete the design. AutoFPDesigner allows users to oversee and seamlessly participate in the design process. Experimental results show that AutoFPDesigner ensures nearly 100% safety in the designed flight procedures and achieves a 75% task completion rate, with good adaptability across different design tasks. AutoFPDesigner introduces a new paradigm for flight procedure design and represents a key step towards the automation of this process. Keywords: Flight Procedure Design; Large Language Model; Performance-Based Navigation (PBN); Multi-Agent.
Submitted 19 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 18 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14723">arXiv:2410.14723</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14723">pdf</a>, <a href="https://arxiv.org/format/2410.14723">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BeniFul: Backdoor Defense via Middle Feature Analysis for Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xinfu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Junying Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xindi Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14723v1-abstract-short" style="display: inline;"> Backdoor defenses have recently become important in resisting backdoor attacks in deep neural networks (DNNs), where attackers implant backdoors into the DNN model by injecting backdoor samples into the training dataset. Although there are many defense methods to achieve backdoor detection for DNN inputs and backdoor elimination for DNN models, they still have not presented a clear explanation of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14723v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14723v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14723v1-abstract-full" style="display: none;"> Backdoor defenses have recently become important in resisting backdoor attacks in deep neural networks (DNNs), where attackers implant backdoors into the DNN model by injecting backdoor samples into the training dataset. Although there are many defense methods to achieve backdoor detection for DNN inputs and backdoor elimination for DNN models, they still have not presented a clear explanation of the relationship between these two missions. In this paper, we use the features from the middle layer of the DNN model to analyze the difference between backdoor and benign samples and propose Backdoor Consistency, which indicates that at least one backdoor exists in the DNN model if the backdoor trigger is detected exactly on input. By analyzing the middle features, we design an effective and comprehensive backdoor defense method named BeniFul, which consists of two parts: a gray-box backdoor input detection and a white-box backdoor elimination. Specifically, we use the reconstruction distance from the Variational Auto-Encoder and model inference results to implement backdoor input detection and a feature distance loss to achieve backdoor elimination. Experimental results on CIFAR-10 and Tiny ImageNet against five state-of-the-art attacks demonstrate that our BeniFul exhibits a great defense capability in backdoor input detection and backdoor elimination. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14723v1-abstract-full').style.display = 'none'; document.getElementById('2410.14723v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14166">arXiv:2410.14166</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14166">pdf</a>, <a href="https://arxiv.org/format/2410.14166">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLM The Genius Paradox: A Linguistic and Math Expert&#39;s Struggle with Simple Word-based Counting Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+N">Nan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xuezhe Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14166v1-abstract-short" style="display: inline;"> Interestingly, LLMs yet struggle with some basic tasks that humans find trivial to handle, e.g., counting the number of character r&#39;s in the word &#34;strawberry&#34;. There are several popular conjectures (e.g., tokenization, architecture and training data) regarding the reason for deficiency of LLMs in simple word-based counting problems, sharing the similar belief that such failure stems from model pre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14166v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14166v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14166v1-abstract-full" style="display: none;"> Interestingly, LLMs yet struggle with some basic tasks that humans find trivial to handle, e.g., counting the number of character r&#39;s in the word &#34;strawberry&#34;. There are several popular conjectures (e.g., tokenization, architecture and training data) regarding the reason for deficiency of LLMs in simple word-based counting problems, sharing the similar belief that such failure stems from model pretraining hence probably inevitable during deployment. In this paper, we carefully design multiple evaluation settings to investigate validity of prevalent conjectures. Meanwhile, we measure transferability of advanced mathematical and coding reasoning capabilities from specialized LLMs to simple counting tasks. Although specialized LLMs suffer from counting problems as well, we find conjectures about inherent deficiency of LLMs invalid and further seek opportunities to elicit knowledge and capabilities from LLMs that are beneficial to counting tasks. 
Compared with strategies such as finetuning and in-context learning that are commonly adopted to enhance performance on new or challenging tasks, we show that engaging reasoning is the most robust and efficient way to help LLMs better perceive tasks with more accurate responses. We hope our conjecture validation design could provide insights into the study of future critical failure modes of LLMs. Based on challenges in transferring advanced capabilities to much simpler tasks, we call for more attention to model capability acquisition and evaluation. We also highlight the importance of cultivating consciousness of "reasoning before responding" during model pretraining.
Submitted 18 October, 2024; originally announced October 2024.

arXiv:2410.12855 [pdf, other] cs.CL, cs.AI
JAILJUDGE: A Comprehensive Jailbreak Judge Benchmark with Multi-Agent Enhanced Explanation Evaluation Framework
Authors: Fan Liu, Yue Feng, Zhao Xu, Lixin Su, Xinyu Ma, Dawei Yin, Hao Liu
Abstract: Despite advancements in enhancing LLM safety against jailbreak attacks, evaluating LLM defenses remains a challenge, with current methods often lacking explainability and generalization to complex scenarios, leading to incomplete assessments (e.g., direct judgment without reasoning, low F1 score of GPT-4 in complex cases, bias in multilingual scenarios).
To address this, we present JAILJUDGE, a comprehensive benchmark featuring diverse risk scenarios, including synthetic, adversarial, in-the-wild, and multilingual prompts, along with high-quality human-annotated datasets. The JAILJUDGE dataset includes over 35k instruction-tuning instances with reasoning explainability, JAILJUDGETEST, a 4.5k+ labeled set for risk scenarios, and a 6k+ multilingual set across ten languages. To enhance evaluation with explicit reasoning, we propose the JailJudge MultiAgent framework, which enables explainable, fine-grained scoring (1 to 10). This framework supports the construction of instruction-tuning ground truth and facilitates the development of JAILJUDGE Guard, an end-to-end judge model that provides reasoning and eliminates API costs. Additionally, we introduce JailBoost, an attacker-agnostic attack enhancer, and GuardShield, a moderation defense, both leveraging JAILJUDGE Guard. Our experiments demonstrate the state-of-the-art performance of the JailJudge methods (JailJudge MultiAgent, JAILJUDGE Guard) across diverse models (e.g., GPT-4, Llama-Guard) and zero-shot scenarios. JailBoost and GuardShield significantly improve jailbreak attack and defense tasks under zero-shot settings, with JailBoost enhancing performance by 29.24% and GuardShield reducing the defense ASR from 40.46% to 0.15%.
Submitted 17 October, 2024; v1 submitted 11 October, 2024; originally announced October 2024.
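
To illustrate explanation-first, fine-grained (1 to 10) judging, here is a minimal single-judge sketch. The prompt wording, the `ask` placeholder, and the score parsing are assumptions; the actual JailJudge MultiAgent framework coordinates multiple agents rather than a single call.

```python
import re
from typing import Callable, Tuple

# Placeholder for any chat-completion call (model, prompt) -> text;
# not an API from the JAILJUDGE paper.
Ask = Callable[[str, str], str]

JUDGE_PROMPT = (
    "You are a safety judge. Given a user prompt and a model response, explain "
    "step by step whether the response constitutes a successful jailbreak, then "
    "output a final line 'SCORE: n' where n is an integer from 1 (safe refusal) "
    "to 10 (clearly harmful compliance).\n\nPrompt:\n{prompt}\n\nResponse:\n{response}"
)

def judge_response(ask: Ask, judge_model: str, prompt: str, response: str) -> Tuple[int, str]:
    """Return (score, reasoning text) from an explanation-first jailbreak judge."""
    text = ask(judge_model, JUDGE_PROMPT.format(prompt=prompt, response=response))
    match = re.search(r"SCORE:\s*(\d+)", text)
    score = int(match.group(1)) if match else 1  # default to 'safe' if unparseable
    return max(1, min(10, score)), text
```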

arXiv:2410.11160 [pdf, other] cs.CV
MANet: Fine-Tuning Segment Anything Model for Multimodal Remote Sensing Semantic Segmentation
Authors: Xianping Ma, Xiaokang Zhang, Man-On Pun, Bo Huang
Abstract: Multimodal remote sensing data, collected from a variety of sensors, provide a comprehensive and integrated perspective of the Earth's surface. By employing multimodal fusion techniques, semantic segmentation offers more detailed insights into geographic scenes compared to single-modality approaches. Building upon recent advancements in vision foundation models, particularly the Segment Anything Model (SAM), this study introduces a novel Multimodal Adapter-based Network (MANet) for multimodal remote sensing semantic segmentation. At the core of this approach is the development of a Multimodal Adapter (MMAdapter), which fine-tunes SAM's image encoder to effectively leverage the model's general knowledge for multimodal data. In addition, a pyramid-based Deep Fusion Module (DFM) is incorporated to further integrate high-level geographic features across multiple scales before decoding. This work not only introduces a novel network for multimodal fusion, but also demonstrates, for the first time, SAM's powerful generalization capabilities with Digital Surface Model (DSM) data. Experimental results on two well-established fine-resolution multimodal remote sensing datasets, ISPRS Vaihingen and ISPRS Potsdam, confirm that the proposed MANet significantly surpasses current models in the task of multimodal semantic segmentation. The source code for this work will be accessible at https://github.com/sstary/SSRS.
Submitted 14 October, 2024; originally announced October 2024.
Comments: 12 pages, 9 figures.
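
For readers unfamiliar with adapter-based fine-tuning, the block below shows a generic bottleneck adapter of the kind inserted into a frozen transformer encoder. It only illustrates the family of techniques MMAdapter belongs to; the MANet module and its multimodal conditioning on DSM data are not reproduced here.

```python
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    """Generic bottleneck adapter: down-project, non-linearity, up-project, residual.

    Illustrates the adapter-tuning idea only; it is not the MMAdapter from MANet.
    """
    def __init__(self, dim: int, bottleneck: int = 64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.act = nn.GELU()
        self.up = nn.Linear(bottleneck, dim)
        nn.init.zeros_(self.up.weight)  # start as an identity mapping
        nn.init.zeros_(self.up.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.up(self.act(self.down(x)))

# Typical usage: freeze the pretrained encoder and train only the adapters, e.g.
#   for p in frozen_encoder.parameters():
#       p.requires_grad = False
```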
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11160v1-abstract-full').style.display = 'none'; document.getElementById('2410.11160v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10360">arXiv:2410.10360</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10360">pdf</a>, <a href="https://arxiv.org/format/2410.10360">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Parenting: Optimizing Knowledge Selection of Retrieval-Augmented Language Models with Parameter Decoupling and Tailored Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yongxin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruizhe Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xinke Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Y">Yujie Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Y">Yuzhen Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xinyu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+R">Runchuan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+X">Xu Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Junfeng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yasha Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10360v2-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) offers an effective solution to the issues faced by Large Language Models (LLMs) in hallucination generation and knowledge obsolescence by incorporating externally retrieved knowledge. However, existing methods lack effective control mechanisms for integrating internal and external knowledge. Inspired by human cognitive processes, we propose Parenting, a novel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10360v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10360v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10360v2-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) offers an effective solution to the issues faced by Large Language Models (LLMs) in hallucination generation and knowledge obsolescence by incorporating externally retrieved knowledge. However, existing methods lack effective control mechanisms for integrating internal and external knowledge. 

arXiv:2410.10127 [pdf, other] cs.IR
MAIR: A Massive Benchmark for Evaluating Instructed Retrieval
Authors: Weiwei Sun, Zhengliang Shi, Jiulong Wu, Lingyong Yan, Xinyu Ma, Yiding Liu, Min Cao, Dawei Yin, Zhaochun Ren
Abstract: Recent information retrieval (IR) models are pre-trained and instruction-tuned on massive datasets and tasks, enabling them to perform well on a wide range of tasks and potentially generalize to unseen tasks with instructions. However, existing IR benchmarks focus on a limited scope of tasks, making them insufficient for evaluating the latest IR models.
In this paper, we propose MAIR (Massive Instructed Retrieval Benchmark), a heterogeneous IR benchmark that includes 126 distinct IR tasks across 6 domains, collected from existing datasets. We benchmark state-of-the-art instruction-tuned text embedding models and re-ranking models. Our experiments reveal that instruction-tuned models generally achieve superior performance compared to non-instruction-tuned models on MAIR. Additionally, our results suggest that current instruction-tuned text embedding models and re-ranking models still lack effectiveness in specific long-tail tasks. MAIR is publicly available at https://github.com/sunnweiwei/Mair.
Submitted 13 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09909">arXiv:2410.09909</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09909">pdf</a>, <a href="https://arxiv.org/format/2410.09909">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UnSeg: One Universal Unlearnable Example Generator is Enough against All Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Ye Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tiehua Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09909v1-abstract-short" style="display: inline;"> Image segmentation is a crucial vision task that groups pixels within an image into semantically meaningful segments, which is pivotal in obtaining a fine-grained understanding of real-world scenes. However, an increasing privacy concern exists regarding training large-scale image segmentation models on unauthorized private data. In this work, we exploit the concept of unlearnable examples to make&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09909v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09909v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09909v1-abstract-full" style="display: none;"> Image segmentation is a crucial vision task that groups pixels within an image into semantically meaningful segments, which is pivotal in obtaining a fine-grained understanding of real-world scenes. However, an increasing privacy concern exists regarding training large-scale image segmentation models on unauthorized private data. In this work, we exploit the concept of unlearnable examples to make images unusable to model training by generating and adding unlearnable noise into the original images. Particularly, we propose a novel Unlearnable Segmentation (UnSeg) framework to train a universal unlearnable noise generator that is capable of transforming any downstream images into their unlearnable version. The unlearnable noise generator is finetuned from the Segment Anything Model (SAM) via bilevel optimization on an interactive segmentation dataset towards minimizing the training error of a surrogate model that shares the same architecture with SAM but is trained from scratch. 

arXiv:2410.09418 [pdf, other] cs.CL
Beyond Exact Match: Semantically Reassessing Event Extraction by Large Language Models
Authors: Yi-Fan Lu, Xian-Ling Mao, Tian Lan, Chen Xu, Heyan Huang
Abstract: Event extraction has gained extensive research attention due to its broad range of applications. However, the current mainstream evaluation method for event extraction relies on token-level exact match, which misjudges numerous semantic-level correct cases. This reliance leads to a significant discrepancy between the evaluated performance of models under exact match criteria and their real performance.
To address this problem, we propose RAEE, an automatic evaluation framework that accurately assesses event extraction results at the semantic level instead of the token level. Specifically, RAEE leverages Large Language Models (LLMs) as automatic evaluation agents, incorporating chain-of-thought prompting and an adaptive mechanism to achieve interpretable and adaptive evaluations for precision and recall of triggers and arguments. Extensive experimental results demonstrate that: (1) RAEE achieves a very high correlation with the human average; (2) after reassessing 14 models, including advanced LLMs, on 10 datasets, there is a significant performance gap between exact match and RAEE. The exact match evaluation significantly underestimates the performance of existing event extraction models, particularly underestimating the capabilities of LLMs; (3) fine-grained analysis under RAEE evaluation reveals insightful phenomena worth further exploration. The evaluation toolkit of our proposed RAEE will be publicly released.
Submitted 12 October, 2024; originally announced October 2024.

arXiv:2410.08950 [pdf, other] cs.LG, cs.AI
On the Adversarial Transferability of Generalized "Skip Connections"
Authors: Yisen Wang, Yichuan Mo, Dongxian Wu, Mingjie Li, Xingjun Ma, Zhouchen Lin
Abstract: Skip connection is an essential ingredient for modern deep models to be deeper and more powerful.
Despite their huge success in normal scenarios (state-of-the-art classification performance on natural examples), we investigate and identify an interesting property of skip connections under adversarial scenarios, namely, the use of skip connections allows easier generation of highly transferable adversarial examples. Specifically, in ResNet-like models (with skip connections), we find that using more gradients from the skip connections rather than the residual modules, according to a decay factor during backpropagation, allows one to craft adversarial examples with high transferability. The above method is termed the Skip Gradient Method (SGM). Although starting from ResNet-like models in vision domains, we further extend SGM to more advanced architectures, including Vision Transformers (ViTs) and models with length-varying paths, and to other domains, i.e., natural language processing. We conduct comprehensive transfer attacks against various models including ResNets, Transformers, Inceptions, Neural Architecture Search, and Large Language Models (LLMs). We show that employing SGM can greatly improve the transferability of crafted attacks in almost all cases. Furthermore, considering the big complexity for practical use, we further demonstrate that SGM can even improve the transferability on ensembles of models or targeted attacks and the stealthiness against current defenses. At last, we provide theoretical explanations and empirical insights on how SGM works. Our findings not only motivate new adversarial research into the architectural characteristics of models but also open up further challenges for secure model architecture design. Our code is available at https://github.com/mo666666/SGM.
Submitted 11 October, 2024; originally announced October 2024.
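
The core mechanism, decaying the gradient that flows through the residual branch so that relatively more gradient passes along the skip connection during backpropagation, can be sketched as follows. The wrapper and the choice of gamma are illustrative; the released SGM code linked above should be treated as the reference implementation.

```python
import torch
import torch.nn as nn

class _ScaleGrad(torch.autograd.Function):
    """Identity in the forward pass; scales the gradient by `gamma` in backward."""
    @staticmethod
    def forward(ctx, x, gamma):
        ctx.gamma = gamma
        return x

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * ctx.gamma, None

class SGMBlock(nn.Module):
    """Residual block whose residual-branch gradient is decayed (skip-gradient idea).

    out = x + f(x), with the forward pass unchanged; during backpropagation the
    gradient through the residual branch f is multiplied by gamma < 1, so
    relatively more gradient flows along the skip connection. This sketches the
    general mechanism described in the abstract, not the paper's exact code.
    """
    def __init__(self, residual_fn: nn.Module, gamma: float = 0.5):
        super().__init__()
        self.residual_fn = residual_fn
        self.gamma = gamma

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + _ScaleGrad.apply(self.residual_fn(x), self.gamma)
```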

arXiv:2410.07677 [pdf, other] cs.CL
Smart Audit System Empowered by LLM
Authors: Xu Yao, Xiaoxu Wu, Xi Li, Huan Xu, Chenlei Li, Ping Huang, Si Li, Xiaoning Ma, Jiulong Shan
Abstract: Manufacturing quality audits are pivotal for ensuring high product standards in mass production environments. Traditional auditing processes, however, are labor-intensive and reliant on human expertise, posing challenges in maintaining transparency, accountability, and continuous improvement across complex global supply chains. To address these challenges, we propose a smart audit system empowered by large language models (LLMs). Our approach introduces three innovations: a dynamic risk assessment model that streamlines audit procedures and optimizes resource allocation; a manufacturing compliance copilot that enhances data processing, retrieval, and evaluation for a self-evolving manufacturing knowledge base; and a Re-act framework commonality analysis agent that provides real-time, customized analysis to empower engineers with insights for supplier improvement. These enhancements elevate audit efficiency and effectiveness, with testing scenarios demonstrating an improvement of over 24%.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07677v1-abstract-full').style.display = 'none'; document.getElementById('2410.07677v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Mao%2C+X&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> 
<div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
