Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 742 results for author: <span class="mathjax">Dong, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Dong%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Dong, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Dong%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Dong, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Dong%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Dong%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Dong%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Dong%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Dong%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Dong%2C+Y&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17607">arXiv:2411.17607</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17607">pdf</a>, <a href="https://arxiv.org/format/2411.17607">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Scaling Speech-Text Pre-training with Synthetic Interleaved Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+A">Aohan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Z">Zhengxiao Du</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Mingdao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+S">Shengmin Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuxiao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jie Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17607v1-abstract-short" style="display: inline;"> Speech language models (SpeechLMs) accept speech input and produce speech output, allowing for more natural human-computer interaction compared to text-based large language models 
(LLMs). Traditional approaches for developing SpeechLMs are constrained by the limited availability of unsupervised speech data and parallel speech-text data, which are significantly less abundant than text pre-training&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17607v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17607v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17607v1-abstract-full" style="display: none;"> Speech language models (SpeechLMs) accept speech input and produce speech output, allowing for more natural human-computer interaction compared to text-based large language models (LLMs). Traditional approaches for developing SpeechLMs are constrained by the limited availability of unsupervised speech data and parallel speech-text data, which are significantly less abundant than text pre-training data, thereby limiting their scalability as LLMs. We propose a novel approach to scaling speech-text pre-training by leveraging large-scale synthetic interleaved data derived from text corpora, eliminating the need for parallel speech-text datasets. Our method efficiently constructs speech-text interleaved data by sampling text spans from existing text corpora and synthesizing corresponding speech spans using a text-to-token model, bypassing the need to generate actual speech. We also employ a supervised speech tokenizer derived from an automatic speech recognition (ASR) model by incorporating a vector-quantized bottleneck into the encoder. This supervised training approach results in discrete speech tokens with strong semantic preservation even at lower sampling rates (e.g. 12.5Hz), while still maintaining speech reconstruction quality. Starting from a pre-trained language model and scaling our pre-training to 1 trillion tokens (with 600B synthetic interleaved speech-text data), we achieve state-of-the-art performance in speech language modeling and spoken question answering, improving performance on spoken questions tasks from the previous SOTA of 13% (Moshi) to 31%. We further demonstrate that by fine-tuning the pre-trained model with speech dialogue data, we can develop an end-to-end spoken chatbot that achieves competitive performance comparable to existing baselines in both conversational abilities and speech quality, even operating exclusively in the speech domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17607v1-abstract-full').style.display = 'none'; document.getElementById('2411.17607v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
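
To make the data-construction step concrete: a minimal sketch of the interleaving loop the abstract describes, assuming a hypothetical `text_to_speech_tokens` stand-in for the paper's text-to-token model (the span lengths and text/speech mixing ratio below are invented for illustration):

```python
import random

def text_to_speech_tokens(text_span):
    # Hypothetical stand-in for the paper's text-to-token model, which maps
    # text straight to discrete speech tokens without synthesizing audio.
    # Here we just emit dummy token ids.
    return [hash(word) % 4096 for word in text_span.split()]

def build_interleaved_sequence(words, span_len=(5, 20), speech_ratio=0.5):
    """Sample alternating spans from one document: each span is kept as text
    or replaced by synthetic speech tokens, yielding interleaved data."""
    sequence, i = [], 0
    while i < len(words):
        n = random.randint(*span_len)
        span = words[i:i + n]
        if random.random() < speech_ratio:
            sequence.append(("speech", text_to_speech_tokens(" ".join(span))))
        else:
            sequence.append(("text", span))
        i += n
    return sequence
```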
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16782">arXiv:2411.16782</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16782">pdf</a>, <a href="https://arxiv.org/format/2411.16782">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Scaling Laws for Black box Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huanran Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16782v1-abstract-short" style="display: inline;"> A longstanding problem of deep learning models is their vulnerability to adversarial examples, which are often generated by applying imperceptible perturbations to natural examples. Adversarial examples exhibit cross-model transferability, enabling to attack black-box models with limited information about their architectures and parameters. Model ensembling is an effective strategy to improve the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16782v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16782v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16782v1-abstract-full" style="display: none;"> A longstanding problem of deep learning models is their vulnerability to adversarial examples, which are often generated by applying imperceptible perturbations to natural examples. Adversarial examples exhibit cross-model transferability, enabling to attack black-box models with limited information about their architectures and parameters. Model ensembling is an effective strategy to improve the transferability by attacking multiple surrogate models simultaneously. However, as prior studies usually adopt few models in the ensemble, there remains an open question of whether scaling the number of models can further improve black-box attacks. Inspired by the findings in large foundation models, we investigate the scaling laws of black-box adversarial attacks in this work. By analyzing the relationship between the number of surrogate models and transferability of adversarial examples, we conclude with clear scaling laws, emphasizing the potential of using more surrogate models to enhance adversarial transferability. Extensive experiments verify the claims on standard image classifiers, multimodal large language models, and even proprietary models like GPT-4o, demonstrating consistent scaling effects and impressive attack success rates with more surrogate models. Further studies by visualization indicate that scaled attacks bring better interpretability in semantics, indicating that the common features of models are captured. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16782v1-abstract-full').style.display = 'none'; document.getElementById('2411.16782v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15100">arXiv:2411.15100</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15100">pdf</a>, <a href="https://arxiv.org/format/2411.15100">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> XGrammar: Flexible and Efficient Structured Generation Engine for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yixin Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+C+F">Charlie F. Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Y">Yaxing Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+R">Ruihang Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Ziyi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yilong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianqi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15100v1-abstract-short" style="display: inline;"> The applications of LLM Agents are becoming increasingly complex and diverse, leading to a high demand for structured outputs that can be parsed into code, structured function calls, and embodied agent commands. These developments bring significant demands for structured generation in LLM inference. Context-free grammar is a flexible approach to enable structured generation via constrained decodin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15100v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15100v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15100v1-abstract-full" style="display: none;"> The applications of LLM Agents are becoming increasingly complex and diverse, leading to a high demand for structured outputs that can be parsed into code, structured function calls, and embodied agent commands. These developments bring significant demands for structured generation in LLM inference. Context-free grammar is a flexible approach to enable structured generation via constrained decoding. However, executing context-free grammar requires going through several stack states over all tokens in vocabulary during runtime, bringing non-negligible overhead for structured generation. 

3. arXiv:2411.15100 [pdf, other]  cs.CL; cs.AI; cs.PL
XGrammar: Flexible and Efficient Structured Generation Engine for Large Language Models
Authors: Yixin Dong, Charlie F. Ruan, Yaxing Cai, Ruihang Lai, Ziyi Xu, Yilong Zhao, Tianqi Chen
Abstract: The applications of LLM Agents are becoming increasingly complex and diverse, leading to a high demand for structured outputs that can be parsed into code, structured function calls, and embodied agent commands. These developments bring significant demands for structured generation in LLM inference. Context-free grammar is a flexible approach to enable structured generation via constrained decoding. However, executing a context-free grammar requires going through several stack states over all tokens in the vocabulary at runtime, bringing non-negligible overhead for structured generation. In this paper, we propose XGrammar, a flexible and efficient structured generation engine for large language models. XGrammar accelerates context-free grammar execution by dividing the vocabulary into context-independent tokens that can be prechecked and context-dependent tokens that need to be interpreted at runtime. We further build transformations to expand the grammar context and reduce the number of context-independent tokens. Additionally, we build an efficient persistent stack to accelerate the context-dependent token checks. Finally, we co-design the grammar engine with the LLM inference engine to overlap grammar computation with GPU execution. Evaluation results show that XGrammar can achieve up to 100x speedup over existing solutions. Combined with an LLM inference engine, it achieves near-zero-overhead structured generation in end-to-end low-latency LLM serving.
Submitted 22 November, 2024; originally announced November 2024.
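
A rough sketch of the vocabulary-partitioning idea (my reading of the abstract, not XGrammar's actual API): classify tokens offline, cache the decisions for context-independent tokens, and only re-check the context-dependent remainder against the grammar stack at each decoding step. All predicates here are hypothetical callables:

```python
def build_token_masks(vocab, is_context_independent, always_valid):
    """Offline pass: context-independent tokens get a fixed allow/deny
    decision; the rest must be interpreted at runtime."""
    static_allow, dynamic = set(), set()
    for token in vocab:
        if is_context_independent(token):
            if always_valid(token):
                static_allow.add(token)
        else:
            dynamic.add(token)
    return static_allow, dynamic

def step_mask(static_allow, dynamic, stack_state, valid_in_state):
    """Per-step token mask: cached decisions plus runtime checks for the
    (ideally small) context-dependent remainder."""
    return static_allow | {t for t in dynamic if valid_in_state(t, stack_state)}
```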

4. arXiv:2411.14432 [pdf, other]  cs.CV
Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models
Authors: Yuhao Dong, Zuyan Liu, Hai-Long Sun, Jingkang Yang, Winston Hu, Yongming Rao, Ziwei Liu
Abstract: Large Language Models (LLMs) demonstrate enhanced capabilities and reliability by reasoning more, evolving from Chain-of-Thought prompting to product-level solutions like OpenAI o1. Despite various efforts to improve LLM reasoning, high-quality long-chain reasoning data and optimized training pipelines remain inadequately explored in vision-language tasks. In this paper, we present Insight-V, an early effort to 1) scalably produce long and robust reasoning data for complex multi-modal tasks and 2) build an effective training pipeline to enhance the reasoning capabilities of multi-modal large language models (MLLMs). Specifically, to create long and structured reasoning data without human labor, we design a two-step pipeline with a progressive strategy to generate sufficiently long and diverse reasoning paths and a multi-granularity assessment method to ensure data quality. We observe that directly supervising MLLMs with such long and complex reasoning data will not yield ideal reasoning ability. To tackle this problem, we design a multi-agent system consisting of a reasoning agent dedicated to performing long-chain reasoning and a summary agent trained to judge and summarize reasoning results. We further incorporate an iterative DPO algorithm to enhance the reasoning agent's generation stability and quality. Based on the popular LLaVA-NeXT model and our stronger base MLLM, we demonstrate significant performance gains across challenging multi-modal benchmarks requiring visual reasoning. Benefiting from our multi-agent system, Insight-V can also easily maintain or improve performance on perception-focused multi-modal tasks.
Submitted 21 November, 2024; originally announced November 2024.

5. arXiv:2411.12359 [pdf, other]  cs.RO
TactV: A Class of Hybrid Terrestrial/Aerial Coaxial Tilt-Rotor Vehicles
Authors: Yifei Dong, Yimin Zhu, Lixian Zhang, Yihang Ding
Abstract: To enhance the obstacle-crossing and endurance capabilities of vehicles operating in complex environments, this paper presents the design of a hybrid terrestrial/aerial coaxial tilt-rotor vehicle, TactV, which integrates advantages such as lightweight construction and high maneuverability. Unlike existing tandem dual-rotor vehicles, TactV employs a tiltable coaxial dual-rotor design and features a spherical cage structure that encases the body, allowing for omnidirectional movement while further reducing its overall dimensions. To enable TactV to maneuver flexibly in the air and on planar and inclined surfaces, we established corresponding dynamic and control models for each mode. Additionally, we leveraged TactV's tiltable center of gravity to design energy-saving and high-mobility modes for ground operations, thereby further enhancing its endurance. Aerial and ground experiments corroborated the superiority of TactV's movement capabilities and control strategies.
Submitted 19 November, 2024; originally announced November 2024.

6. arXiv:2411.11406 [pdf, other]  cs.LG; cs.RO
Bridging the Resource Gap: Deploying Advanced Imitation Learning Models onto Affordable Embedded Platforms
Authors: Haizhou Ge, Ruixiang Wang, Zhu-ang Xu, Hongrui Zhu, Ruichen Deng, Yuhang Dong, Zeyu Pang, Guyue Zhou, Junyu Zhang, Lu Shi
Abstract: Advanced imitation learning with structures like the transformer is increasingly demonstrating its advantages in robotics. However, deploying these large-scale models on embedded platforms remains a major challenge. In this paper, we propose a pipeline that facilitates the migration of advanced imitation learning algorithms to edge devices. The process is achieved via an efficient model compression method and a practical asynchronous parallel method, Temporal Ensemble with Dropped Actions (TEDA), that enhances the smoothness of operations. To show the efficiency of the proposed pipeline, large-scale imitation learning models are trained on a server and deployed on an edge device to complete various manipulation tasks.
Submitted 18 November, 2024; originally announced November 2024.
Comments: Accepted by the 2024 IEEE International Conference on Robotics and Biomimetics (IEEE ROBIO 2024)

7. arXiv:2411.10636 [pdf, other]  cs.CL; cs.AI; cs.LG
Gender Bias Mitigation for Bangla Classification Tasks
Authors: Sajib Kumar Saha Joy, Arman Hassan Mahy, Meherin Sultana, Azizah Mamun Abha, MD Piyal Ahmmed, Yue Dong, G M Shahariar
Abstract: In this study, we investigate gender bias in Bangla pretrained language models, a largely underexplored area in low-resource languages. To assess this bias, we applied gender-name swapping techniques to existing datasets, creating four manually annotated, task-specific datasets for sentiment analysis, toxicity detection, hate speech detection, and sarcasm detection. By altering names and gender-specific terms, we ensured these datasets were suitable for detecting and mitigating gender bias. We then proposed a joint loss optimization technique to mitigate gender bias across task-specific pretrained models. Our approach was evaluated against existing bias mitigation methods, with results showing that our technique not only effectively reduces bias but also maintains competitive accuracy compared to other baseline approaches. To promote further research, we have made both our implementation and datasets publicly available at https://github.com/sajib-kumar/Gender-Bias-Mitigation-From-Bangla-PLM
Submitted 15 November, 2024; originally announced November 2024.
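
The gender-name swapping augmentation is simple to picture; a toy sketch follows (the paper works on Bangla text, so a real swap table would hold Bangla names and gendered terms; the English pairs below are placeholders):

```python
# Placeholder swap table; a real one would map Bangla names and terms.
SWAP = {"he": "she", "she": "he", "his": "her", "her": "his",
        "rahim": "rahima", "rahima": "rahim"}

def gender_swap(tokens):
    """Counterfactual augmentation: swap gendered names and terms so a
    model trained on both versions cannot lean on gender cues."""
    return [SWAP.get(token.lower(), token) for token in tokens]

# e.g. gender_swap("He praised his colleague".split())
#      -> ['she', 'praised', 'her', 'colleague']
```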

8. arXiv:2411.08374 [pdf, other]  cs.LG; cs.DC
Federated Graph Learning with Graphless Clients
Authors: Xingbo Fu, Song Wang, Yushun Dong, Binchi Zhang, Chen Chen, Jundong Li
Abstract: Federated Graph Learning (FGL) is tasked with training machine learning models, such as Graph Neural Networks (GNNs), for multiple clients, each with its own graph data. Existing methods usually assume that each client has both the node features and graph structure of its graph data. In real-world scenarios, however, there exist federated systems where only some of the clients have such data, while other clients (i.e., graphless clients) may have only node features. This naturally leads to a novel problem in FGL: how to jointly train a model over distributed graph data with graphless clients? In this paper, we propose a novel framework, FedGLS, to tackle this problem. In FedGLS, we devise a local graph learner on each graphless client that learns the local graph structure with structure knowledge transferred from other clients. To enable structure knowledge transfer, we design a GNN model and a feature encoder on each client. During local training, the feature encoder retains the local graph structure knowledge together with the GNN model via knowledge distillation, and the structure knowledge is transferred among clients in the global update. Our extensive experiments demonstrate the superiority of the proposed FedGLS over five baselines.
Submitted 13 November, 2024; originally announced November 2024.
Comments: Accepted by Transactions on Machine Learning Research (TMLR)
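
One plausible reading of the local training step (a sketch under my own assumptions; the abstract does not give the exact objective): the structure-free feature encoder is distilled toward the structure-aware GNN embeddings while the GNN trains on the task.

```python
import torch.nn.functional as F

def local_loss(gnn_logits, labels, gnn_emb, enc_emb, beta=1.0):
    """Hypothetical client objective: task loss for the GNN plus a
    distillation term pulling the feature encoder's embeddings toward
    the GNN's structure-aware ones."""
    task = F.cross_entropy(gnn_logits, labels)
    distill = F.mse_loss(enc_emb, gnn_emb.detach())
    return task + beta * distill
```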

9. arXiv:2411.07741 [pdf, other]  cs.FL
Vulnerabilities Analysis and Secure Controlling for Unmanned Aerial System Based on Reactive Synthesis
Authors: Dong Yang, Wei Dong, Wei Lu, Yanqi Dong, Sirui Liu
Abstract: Complex Cyber-Physical Systems (CPS) such as Unmanned Aerial Systems (UAS) have developed rapidly in recent years, but have also become vulnerable to GPS spoofing, packet injection, buffer overflows, and other malicious attacks. Ensuring that the behavior of a UAS remains secure no matter how the environment changes is a promising direction for UAS security. This paper introduces a pattern-based framework for describing the security properties of UAS and presents a reactive-synthesis-based approach to automatically generate a secure UAS controller. First, we study the operating mechanism of UAS and construct a high-level model consisting of an actuator and a monitor. We also analyze the security threats to UAS from the hardware, software, and cyber-physical perspectives, and summarize the corresponding specification patterns of security properties as LTL formulas. With the UAS model and the security specification patterns, controller automata can be constructed by the General Reactivity of Rank 1 (GR(1)) synthesis algorithm, a two-player game between the Unmanned Aerial Vehicle (UAV) and its environment. Finally, we extend the LTLMoP platform to implement control simulation in multi-robot systems, providing secure behavior strategies under several attack scenarios.
Submitted 12 November, 2024; originally announced November 2024.
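
As an invented illustration of the shape such LTL specification patterns take (not a formula from the paper), a response pattern for GPS spoofing might require the controller to switch to inertial navigation at the next step and eventually hold position:

```latex
% Hypothetical "response" pattern for a GPS-spoofing monitor signal.
\mathbf{G}\bigl(\mathit{gps\_spoofed} \;\rightarrow\;
    \mathbf{X}\,\mathit{use\_ins} \,\wedge\, \mathbf{F}\,\mathit{hold\_position}\bigr)
```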

10. arXiv:2411.07724 [pdf, other]  cs.LG; math.OC
Convergence Rate Analysis of LION
Authors: Yiming Dong, Huan Li, Zhouchen Lin
Abstract: The LION (evoLved sIgn mOmeNtum) optimizer for deep neural network training was discovered by Google via program search; despite its simple sign update, it shows impressive performance in training large-scale networks. Although previous studies have investigated its convergence properties, a comprehensive analysis, especially of the convergence rate, is still desirable. Recognizing that LION can be regarded as solving a specific constrained problem, this paper focuses on demonstrating its convergence to the Karush-Kuhn-Tucker (KKT) point at the rate of $\mathcal{O}(\sqrt{d}K^{-1/4})$ measured by the gradient $\ell_1$ norm, where $d$ is the problem dimension and $K$ is the number of iteration steps. Going a step further, we remove the constraint and establish that LION converges to the critical point of the general unconstrained problem at the same rate. This rate not only delivers the currently optimal dependence on the problem dimension $d$ but also tightly matches the theoretical lower bound for nonconvex stochastic optimization algorithms, which is typically measured using the gradient $\ell_2$ norm, with respect to the number of iterations $K$. Through extensive experiments, we not only demonstrate that LION achieves lower loss and higher performance compared to standard SGD, but also empirically confirm that the gradient $\ell_1/\ell_2$ norm ratio aligns with $\Theta(\sqrt{d})$, thus proving that our convergence rate matches the theoretical lower bound with respect to $d$ in the empirical sense.
Submitted 12 November, 2024; originally announced November 2024.

11. arXiv:2411.07112 [pdf, other]  cs.SE
ROCODE: Integrating Backtracking Mechanism and Program Analysis in Large Language Models for Code Generation
Authors: Xue Jiang, Yihong Dong, Yongding Tao, Huanyu Liu, Zhi Jin, Wenpin Jiao, Ge Li
Abstract: Large language models (LLMs) have recently achieved impressive performance in code generation, offering programmers revolutionary assistance in software development. However, due to the auto-regressive nature of LLMs, they are susceptible to error accumulation during code generation. Once an error is produced, LLMs can merely continue to generate the subsequent code conditioned on it, given their inability to adjust previous outputs. Existing LLM-based approaches typically consider post-revising after code generation, leading to the challenging resolution of accumulated errors and the significant wastage of resources. Ideally, LLMs should roll back and resolve the error in time during code generation, rather than proceed on the basis of the error and wait for post-revising after generation. In this paper, we propose ROCODE, which integrates a backtracking mechanism and program analysis into LLMs for code generation. Specifically, we employ program analysis to perform incremental error detection during the generation process. When an error is detected, the backtracking mechanism is triggered to apply rollback strategies and constrained regeneration, thereby eliminating the error early and ensuring continued generation on a correct basis. Experiments on multiple code generation benchmarks show that ROCODE can significantly reduce the errors generated by LLMs, with a compilation pass rate of 99.1%. The test pass rate is improved by up to 23.8% compared to the best baseline approach. Compared to the post-revising baseline, the token cost is reduced by 19.3%. Moreover, our approach is model-agnostic and achieves consistent improvements across nine representative LLMs.
Submitted 11 November, 2024; originally announced November 2024.
Comments: ICSE 2025
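
A schematic of the generate-check-rollback loop the abstract describes (my sketch; `llm` and `analyze` are hypothetical callables, not the paper's interfaces):

```python
def backtracking_generate(llm, analyze, prompt, max_steps=400):
    """`llm` proposes the next line given the accepted prefix (None when
    done); `analyze` runs incremental program analysis and returns
    (rollback_index, message) on error, or None if the prefix is clean."""
    lines, hints, steps = [], [], 0
    while steps < max_steps:
        steps += 1
        line = llm(prompt, prefix=lines, avoid=hints)
        if line is None:                  # model signals completion
            break
        error = analyze(lines + [line])
        if error is None:
            lines.append(line)            # accept and continue on a correct basis
        else:
            rollback_to, message = error
            lines = lines[:rollback_to]   # discard the faulty suffix early
            hints.append(message)         # constrain the regeneration attempt
    return "\n".join(lines)
```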
arXiv:2411.06377 [pdf, other] cs.RO (Robotics)
SymmeTac: Symmetric Color LED Driven Efficient Photometric Stereo Reconstruction Methods for Camera-based Tactile Sensors
Authors: Jieji Ren, Heng Guo, Zaiyan Yang, Jinnuo Zhang, Yueshi Dong, Ningbin Zhang, Boxin Shi, Jiang Zou, Guoying Gu
Abstract: Camera-based tactile sensors can provide high-density surface geometry and force information for robots during interaction with a target. However, most existing methods cannot achieve accurate reconstruction with high efficiency, impeding their application in robots. To address these problems, we propose an efficient two-shot photometric stereo method based on symmetric color LED distribution. Specifically, based on the sensing response curve of CMOS channels, we design orthogonal red and blue LEDs as illumination to acquire four observation maps using channel-splitting in a two-shot manner. Subsequently, we develop a two-shot photometric stereo theory, which can estimate accurate surface normals and greatly reduce the computing overhead. Finally, leveraging the characteristics of the camera-based tactile sensor, we optimize the algorithm to be a highly efficient, pure addition operation. Simulation and real-world experiments demonstrate the advantages of our approach. Further details are available at: https://github.com/Tacxels/SymmeTac.
Submitted 10 November, 2024; originally announced November 2024.
Comments: This work has been submitted to the IEEE for possible publication
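For readers unfamiliar with photometric stereo, the recovery step underlying such methods is a per-pixel least-squares solve against known light directions. The snippet below is a generic Lambertian toy example with synthetic data, not the paper's two-shot theory; the four observation maps merely mimic the four channel-split measurements.

```python
import numpy as np

# Classic least-squares photometric stereo on synthetic data. Four light
# directions stand in for the four channel-split observation maps; values
# are illustrative, not the paper's calibration.
H = W = 8
L = np.array([            # 4 unit light directions, one per observation map
    [ 0.5,  0.0, 0.866],
    [-0.5,  0.0, 0.866],
    [ 0.0,  0.5, 0.866],
    [ 0.0, -0.5, 0.866],
])

n_true = np.zeros((H, W, 3)); n_true[..., 2] = 1.0   # ground-truth normals
I = np.einsum("ld,hwd->lhw", L, n_true)              # Lambertian shading I = L n

obs = I.reshape(4, -1)                               # (4, H*W) observations
g, *_ = np.linalg.lstsq(L, obs, rcond=None)          # g = albedo * normal, (3, H*W)
normals = (g / np.linalg.norm(g, axis=0)).T.reshape(H, W, 3)
print(np.allclose(normals, n_true, atol=1e-6))       # True
```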
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04397">arXiv:2411.04397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04397">pdf</a>, <a href="https://arxiv.org/format/2411.04397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Bayesian Mixture Model of Temporal Point Processes with Determinantal Point Process Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yiwei Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+S">Shaoxin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yuwen Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Q">Qiyu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hongteng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hanfang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04397v1-abstract-short" style="display: inline;"> Asynchronous event sequence clustering aims to group similar event sequences in an unsupervised manner. Mixture models of temporal point processes have been proposed to solve this problem, but they often suffer from overfitting, leading to excessive cluster generation with a lack of diversity. To overcome these limitations, we propose a Bayesian mixture model of Temporal Point Processes with Deter&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04397v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04397v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04397v1-abstract-full" style="display: none;"> Asynchronous event sequence clustering aims to group similar event sequences in an unsupervised manner. Mixture models of temporal point processes have been proposed to solve this problem, but they often suffer from overfitting, leading to excessive cluster generation with a lack of diversity. To overcome these limitations, we propose a Bayesian mixture model of Temporal Point Processes with Determinantal Point Process prior (TP$^2$DP$^2$) and accordingly an efficient posterior inference algorithm based on conditional Gibbs sampling. Our work provides a flexible learning framework for event sequence clustering, enabling automatic identification of the potential number of clusters and accurate grouping of sequences with similar features. It is applicable to a wide range of parametric temporal point processes, including neural network-based models. Experimental results on both synthetic and real-world data suggest that our framework could produce moderately fewer yet more diverse mixture components, and achieve outstanding results across multiple evaluation metrics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04397v1-abstract-full').style.display = 'none'; document.getElementById('2411.04397v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04291">arXiv:2411.04291</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04291">pdf</a>, <a href="https://arxiv.org/format/2411.04291">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unfair Alignment: Examining Safety Alignment Across Vision Encoder Layers in Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bachu%2C+S">Saketh Bachu</a>, <a href="/search/cs?searchtype=author&amp;query=Shayegani%2C+E">Erfan Shayegani</a>, <a href="/search/cs?searchtype=author&amp;query=Chakraborty%2C+T">Trishna Chakraborty</a>, <a href="/search/cs?searchtype=author&amp;query=Lal%2C+R">Rohit Lal</a>, <a href="/search/cs?searchtype=author&amp;query=Dutta%2C+A">Arindam Dutta</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+C">Chengyu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yue Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Abu-Ghazaleh%2C+N">Nael Abu-Ghazaleh</a>, <a href="/search/cs?searchtype=author&amp;query=Roy-Chowdhury%2C+A+K">Amit K. Roy-Chowdhury</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04291v1-abstract-short" style="display: inline;"> Vision-language models (VLMs) have improved significantly in multi-modal tasks, but their more complex architecture makes their safety alignment more challenging than the alignment of large language models (LLMs). In this paper, we reveal an unfair distribution of safety across the layers of VLM&#39;s vision encoder, with earlier and middle layers being disproportionately vulnerable to malicious input&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04291v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04291v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04291v1-abstract-full" style="display: none;"> Vision-language models (VLMs) have improved significantly in multi-modal tasks, but their more complex architecture makes their safety alignment more challenging than the alignment of large language models (LLMs). In this paper, we reveal an unfair distribution of safety across the layers of VLM&#39;s vision encoder, with earlier and middle layers being disproportionately vulnerable to malicious inputs compared to the more robust final layers. 
This 'cross-layer' vulnerability stems from the model's inability to generalize its safety training from the default architectural settings used during training to unseen or out-of-distribution scenarios, leaving certain layers exposed. We conduct a comprehensive analysis by projecting activations from various intermediate layers and demonstrate that these layers are more likely to generate harmful outputs when exposed to malicious inputs. Our experiments with LLaVA-1.5 and Llama 3.2 show discrepancies in attack success rates and toxicity scores across layers, indicating that current safety alignment strategies focused on a single default layer are insufficient.
Submitted 6 November, 2024; originally announced November 2024.
Comments: Preprint, Under Review
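Mechanically, the kind of layer-wise analysis the abstract mentions (collecting intermediate activations and scoring each layer's output with the same downstream head) can be set up with forward hooks. The sketch below uses a toy MLP and a random scoring head as stand-ins for a vision encoder and its evaluation; nothing here reproduces the paper's models or metrics.

```python
import torch
from torch import nn

# Register forward hooks on a stack of blocks, collect each intermediate
# activation, and score every layer with one shared head so layers can be
# compared. The tiny MLP "encoder" and random data are stand-ins.
torch.manual_seed(0)
encoder = nn.Sequential(
    *[nn.Sequential(nn.Linear(16, 16), nn.GELU()) for _ in range(4)])
head = nn.Linear(16, 1)   # stand-in for the shared downstream head

acts = {}
for i, block in enumerate(encoder):
    block.register_forward_hook(lambda m, inp, out, i=i: acts.__setitem__(i, out))

x = torch.randn(8, 16)
encoder(x)                # one forward pass populates acts[0..3]

for layer, a in sorted(acts.items()):
    score = head(a).mean().item()   # per-layer score under the shared head
    print(f"layer {layer}: mean head score {score:+.3f}")
```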
arXiv:2411.03572 [pdf] cs.IR (Information Retrieval)
Advanced RAG Models with Graph Structures: Optimizing Complex Knowledge Reasoning and Text Generation
Authors: Yuxin Dong, Shuo Wang, Hongye Zheng, Jiajing Chen, Zhenhong Zhang, Chihang Wang
Abstract: This study aims to optimize the existing retrieval-augmented generation (RAG) model by introducing a graph structure to improve the performance of the model in dealing with complex knowledge reasoning tasks. The traditional RAG model suffers from insufficient processing efficiency when facing complex graph-structured information (such as knowledge graphs and hierarchical relationships), which affects the quality and consistency of the generated results. This study proposes a scheme to process graph-structured data by incorporating a graph neural network (GNN), so that the model can capture the complex relationships between entities, thereby improving the knowledge consistency and reasoning ability of the generated text. The experiments used the Natural Questions (NQ) dataset and compared the model with multiple existing generation models. The results show that the graph-based RAG model proposed in this paper is superior to the traditional generation models in terms of quality, knowledge consistency, and reasoning ability, especially when dealing with tasks that require multi-dimensional reasoning. Through the combination of an enhanced retrieval module and the graph neural network, the model in this study can better handle complex knowledge background information and has broad potential value in multiple practical application scenarios.
Submitted 5 November, 2024; originally announced November 2024.
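As a rough picture of combining a GNN with retrieval, the sketch below runs one GCN-style propagation step over a toy entity graph and uses the refined entity embedding to re-rank stand-in passage vectors. The graph, dimensions, and scoring are all illustrative assumptions, not the paper's architecture.

```python
import numpy as np

# One round of GCN-style message passing over a toy knowledge graph,
# followed by similarity-based re-ranking of retrieved passages.
rng = np.random.default_rng(0)
A = np.array([[0, 1, 1],
              [1, 0, 0],
              [1, 0, 0]], dtype=float)       # 3-entity adjacency
A_hat = A + np.eye(3)                        # add self-loops
D_inv = np.diag(1.0 / A_hat.sum(1))          # row-normalization
X = rng.normal(size=(3, 8))                  # initial entity features
H = np.tanh(D_inv @ A_hat @ X)               # propagated entity embeddings

query = H[0]                                 # entity 0 anchors the query
passages = rng.normal(size=(5, 8))           # stand-in passage embeddings
scores = passages @ query                    # dot-product relevance
print("passage ranking:", np.argsort(-scores))
```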
arXiv:2411.03360 [pdf, other] cs.LG (Machine Learning); stat.AP (Applications)
Pedestrian Volume Prediction Using a Diffusion Convolutional Gated Recurrent Unit Model
Authors: Yiwei Dong, Tingjin Chu, Lele Zhang, Hadi Ghaderi, Hanfang Yang
Abstract: Effective models for analysing and predicting pedestrian flow are important to ensure the safety of both pedestrians and other road users. These tools also play a key role in optimising infrastructure design and geometry and supporting the economic utility of interconnected communities. The implementation of city-wide automatic pedestrian counting systems provides researchers with invaluable data, enabling the development and training of deep learning applications that offer better insights into traffic and crowd flows. Benefiting from real-world data provided by the City of Melbourne pedestrian counting system, this study presents a pedestrian flow prediction model, as an extension of the Diffusion Convolutional Gated Recurrent Unit (DCGRU) with dynamic time warping, named DCGRU-DTW. This model captures the spatial dependencies of pedestrian flow through the diffusion process and the temporal dependencies through a Gated Recurrent Unit (GRU). Through extensive numerical experiments, we demonstrate that the proposed model outperforms the classic vector autoregressive model and the original DCGRU across multiple model accuracy metrics.
Submitted 4 November, 2024; originally announced November 2024.
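The diffusion process at the heart of DCGRU-style models is a weighted sum over powers of the graph's random-walk transition matrix. A minimal version is below; the graph, features, and weights are illustrative, and the full model wraps this operation inside GRU gates (with DCGRU-DTW additionally informing the graph via dynamic time warping).

```python
import numpy as np

# Diffusion convolution: out = sum_k theta_k * (D^-1 A)^k @ X,
# i.e. features diffuse k hops over the sensor graph.
rng = np.random.default_rng(1)
A = rng.random((4, 4)) < 0.5                 # toy sensor adjacency
A = (A | A.T).astype(float)
np.fill_diagonal(A, 0)
P = A / np.clip(A.sum(1, keepdims=True), 1, None)   # random-walk matrix D^-1 A

X = rng.normal(size=(4, 2))                  # 4 sensors, 2 input features
K = 3                                        # number of diffusion steps
theta = rng.normal(size=(K,))                # one learnable weight per step
out = sum(theta[k] * np.linalg.matrix_power(P, k) @ X for k in range(K))
print(out.shape)                             # (4, 2): diffused features
```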
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03360v1-abstract-full').style.display = 'none'; document.getElementById('2411.03360v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02608">arXiv:2411.02608</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02608">pdf</a>, <a href="https://arxiv.org/format/2411.02608">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SSFold: Learning to Fold Arbitrary Crumpled Cloth Using Graph Dynamics from Human Demonstration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Changshi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haichuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jiarui Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Luan%2C+F">Feng Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhipeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yanchao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yanmin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+B">Bin He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02608v1-abstract-short" style="display: inline;"> Robotic cloth manipulation faces challenges due to the fabric&#39;s complex dynamics and the high dimensionality of configuration spaces. Previous methods have largely focused on isolated smoothing or folding tasks and overly reliant on simulations, often failing to bridge the significant sim-to-real gap in deformable object manipulation. To overcome these challenges, we propose a two-stream architect&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02608v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02608v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02608v1-abstract-full" style="display: none;"> Robotic cloth manipulation faces challenges due to the fabric&#39;s complex dynamics and the high dimensionality of configuration spaces. Previous methods have largely focused on isolated smoothing or folding tasks and overly reliant on simulations, often failing to bridge the significant sim-to-real gap in deformable object manipulation. To overcome these challenges, we propose a two-stream architecture with sequential and spatial pathways, unifying smoothing and folding tasks into a single adaptable policy model that accommodates various cloth types and states. 
The sequential stream determines the pick and place positions for the cloth, while the spatial stream, using a connectivity dynamics model, constructs a visibility graph from partial point cloud data of the self-occluded cloth, allowing the robot to infer the cloth's full configuration from incomplete observations. To bridge the sim-to-real gap, we utilize a hand tracking detection algorithm to gather and integrate human demonstration data into our novel end-to-end neural network, improving real-world adaptability. Our method, validated on a UR5 robot across four distinct cloth folding tasks with different goal shapes, consistently achieves folded states from arbitrary crumpled initial configurations, with success rates of 99%, 99%, 83%, and 67%. It outperforms existing state-of-the-art cloth manipulation techniques and demonstrates strong generalization to unseen cloth with diverse colors, shapes, and stiffness in real-world experiments. Videos and source code are available at: https://zcswdt.github.io/SSFold/
Submitted 24 October, 2024; originally announced November 2024.
arXiv:2411.02337 [pdf, other] cs.CL (Computation and Language)
WebRL: Training LLM Web Agents via Self-Evolving Online Curriculum Reinforcement Learning
Authors: Zehan Qi, Xiao Liu, Iat Long Iong, Hanyu Lai, Xueqiao Sun, Xinyue Yang, Jiadai Sun, Yu Yang, Shuntian Yao, Tianjie Zhang, Wei Xu, Jie Tang, Yuxiao Dong
Abstract: Large language models (LLMs) have shown remarkable potential as autonomous agents, particularly in web-based tasks. However, existing LLM web agents heavily rely on expensive proprietary LLM APIs, while open LLMs lack the necessary decision-making capabilities. This paper introduces WebRL, a self-evolving online curriculum reinforcement learning framework designed to train high-performance web agents using open LLMs. WebRL addresses three key challenges in building LLM web agents, including the scarcity of training tasks, sparse feedback signals, and policy distribution drift in online learning. Specifically, WebRL incorporates 1) a self-evolving curriculum that generates new tasks from unsuccessful attempts, 2) a robust outcome-supervised reward model (ORM), and 3) adaptive reinforcement learning strategies to ensure consistent improvements. We apply WebRL to transform open Llama-3.1 and GLM-4 models into proficient web agents. On WebArena-Lite, WebRL improves the success rate of Llama-3.1-8B from 4.8% to 42.4%, and from 6.1% to 43% for GLM-4-9B. These open models significantly surpass the performance of GPT-4-Turbo (17.6%) and GPT-4o (13.9%) and outperform previous state-of-the-art web agents trained on open LLMs (AutoWebGLM, 18.2%). Our findings demonstrate WebRL's effectiveness in bridging the gap between open and proprietary LLM-based web agents, paving the way for more accessible and powerful autonomous web interaction systems.
Submitted 4 November, 2024; originally announced November 2024.
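The self-evolving curriculum can be caricatured as a loop in which failed tasks spawn new variants for the next round while successful rollouts drive the policy update. Everything below is a mock (a random success model in place of rollouts plus ORM labeling, a scalar "skill" in place of the RL update), intended only to show the control flow the abstract describes, not the WebRL system.

```python
import random

# Skeletal self-evolving curriculum loop: failures seed new task variants,
# successes stand in for ORM-approved trajectories that update the policy.
random.seed(0)

def attempt(task, policy_skill):
    # Mock rollout + outcome labeling: success probability rises with skill
    # and falls with task difficulty.
    return random.random() < policy_skill / task["difficulty"]

def mutate(task):
    # Curriculum step: spawn a slightly easier variant of a failed task.
    return {"name": task["name"] + "'",
            "difficulty": max(1.0, task["difficulty"] - 0.5)}

tasks = [{"name": f"t{i}", "difficulty": d} for i, d in enumerate([1.0, 2.0, 3.0])]
skill = 0.8
for epoch in range(3):
    failures, successes = [], []
    for task in tasks:
        (successes if attempt(task, skill) else failures).append(task)
    tasks += [mutate(t) for t in failures]   # self-evolving task pool
    skill += 0.1 * len(successes)            # stand-in for the policy update
    print(f"epoch {epoch}: {len(successes)} successes, pool size {len(tasks)}")
```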
arXiv:2411.01893 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
A Global Depth-Range-Free Multi-View Stereo Transformer Network with Pose Embedding
Authors: Yitong Dong, Yijin Li, Zhaoyang Huang, Weikang Bian, Jingbo Liu, Hujun Bao, Zhaopeng Cui, Hongsheng Li, Guofeng Zhang
Abstract: In this paper, we propose a novel multi-view stereo (MVS) framework that gets rid of the depth range prior. Unlike recent prior-free MVS methods that work in a pair-wise manner, our method simultaneously considers all the source images. Specifically, we introduce a Multi-view Disparity Attention (MDA) module to aggregate long-range context information within and across multi-view images. Considering the asymmetry of the epipolar disparity flow, the key to our method lies in accurately modeling multi-view geometric constraints. We integrate pose embedding to encapsulate information such as multi-view camera poses, providing implicit geometric constraints for multi-view disparity feature fusion dominated by attention. Additionally, we construct corresponding hidden states for each source image due to significant differences in the observation quality of the same pixel in the reference frame across multiple source frames. We explicitly estimate the quality of the current pixel corresponding to sampled points on the epipolar line of the source image and dynamically update hidden states through the uncertainty estimation module.
Extensive results on the DTU dataset and the Tanks & Temples benchmark demonstrate the effectiveness of our method. The code is available at our project page: https://zju3dv.github.io/GD-PoseMVS/.
Submitted 4 November, 2024; originally announced November 2024.

arXiv:2411.01603 [pdf, other] cs.RO (Robotics)
An Aerial Transport System in Marine GNSS-Denied Environment
Authors: Jianjun Sun, Zhenwei Niu, Yihao Dong, Fenglin Zhang, Muhayy Ud Din, Lakmal Seneviratne, Defu Lin, Irfan Hussain, Shaoming He
Abstract: This paper presents an autonomous aerial system specifically engineered for operation in challenging marine GNSS-denied environments, aimed at transporting small cargo from a target vessel. In these environments, characterized by weakly textured sea surfaces with few feature points, chaotic deck oscillations due to waves, and significant wind gusts, conventional navigation methods often prove inadequate. Leveraging the DJI M300 platform, our system is designed to autonomously navigate and transport cargo while overcoming these environmental challenges.
In particular, this paper proposes an anchor-based localization method using ultra-wideband (UWB) and QR-code facilities, which decouples the UAV's attitude from that of the moving landing platform, thus reducing control oscillations caused by platform movement. Additionally, a motor-driven attachment mechanism for cargo is designed, which enhances the UAV's field of view during descent and ensures a reliable attachment to the cargo upon landing. The system's reliability and effectiveness were progressively enhanced through multiple outdoor experimental iterations and were validated by the successful cargo transport during the 2024 Mohamed Bin Zayed International Robotics Challenge (MBZIRC 2024) competition. Crucially, the system addresses uncertainties and interferences inherent in maritime transportation missions without prior knowledge of cargo locations on the deck and with strict limitations on intervention throughout the transportation.
Submitted 3 November, 2024; originally announced November 2024.

arXiv:2411.01602 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
DreamPolish: Domain Score Distillation With Progressive Geometry Generation
Authors: Yean Cheng, Ziqi Cai, Ming Ding, Wendi Zheng, Shiyu Huang, Yuxiao Dong, Jie Tang, Boxin Shi
Abstract: We introduce DreamPolish, a text-to-3D generation model that excels in producing refined geometry and high-quality textures. In the geometry construction phase, our approach leverages multiple neural representations to enhance the stability of the synthesis process. Instead of relying solely on a view-conditioned diffusion prior in the novel sampled views, which often leads to undesired artifacts in the geometric surface, we incorporate an additional normal estimator to polish the geometry details, conditioned on viewpoints with varying field-of-views. We propose to add a surface polishing stage with only a few training steps, which can effectively refine the artifacts attributed to limited guidance from previous stages and produce 3D objects with more desirable geometry. The key topic of texture generation using pretrained text-to-image models is to find a suitable domain in the vast latent distribution of these models that contains photorealistic and consistent renderings. In the texture generation phase, we introduce a novel score distillation objective, namely domain score distillation (DSD), to guide neural representations toward such a domain. We draw inspiration from classifier-free guidance (CFG) in text-conditioned image generation tasks and show that CFG and variational distribution guidance represent distinct aspects of gradient guidance and are both imperative domains for the enhancement of texture quality. Extensive experiments show our proposed model can produce 3D assets with polished surfaces and photorealistic textures, outperforming existing state-of-the-art methods.
Submitted 3 November, 2024; originally announced November 2024.
arXiv:2411.00915 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
V-LoRA: An Efficient and Flexible System Boosts Vision Applications with LoRA LMM
Authors: Liang Mi, Weijun Wang, Wenming Tu, Qingfeng He, Rui Kong, Xinyu Fang, Yazhu Dong, Yikang Zhang, Yunchun Li, Meng Li, Haipeng Dai, Guihai Chen, Yunxin Liu
Abstract: Large Multimodal Models (LMMs) have shown significant progress in various complex vision tasks with the solid linguistic and reasoning capacity inherited from large language models (LLMs). Low-rank adaptation (LoRA) offers a promising method to integrate external knowledge into LMMs, compensating for their limitations on domain-specific tasks. However, the existing LoRA model serving is excessively computationally expensive and causes extremely high latency. In this paper, we present an end-to-end solution that empowers diverse vision tasks and enriches vision applications with LoRA LMMs.
Our system, VaLoRA, enables accurate and efficient vision tasks by 1) an accuracy-aware LoRA adapter generation approach that generates LoRA adapters rich in domain-specific knowledge to meet application-specific accuracy requirements, 2) an adaptive-tiling LoRA adapters batching operator that efficiently computes concurrent heterogeneous LoRA adapters, and 3) a flexible LoRA adapter orchestration mechanism that manages application requests and LoRA adapters to achieve the lowest average response latency. We prototype VaLoRA on five popular vision tasks on three LMMs. Experiment results reveal that VaLoRA improves accuracy by 24-62% compared to the original LMMs and reduces latency by 20-89% compared to state-of-the-art LoRA model serving systems.
Submitted 1 November, 2024; originally announced November 2024.
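To see why batching heterogeneous adapters is awkward, note that adapters of different ranks cannot share one stacked matmul. A simple workaround is to group concurrent requests by adapter rank, which is a much-simplified stand-in for the adaptive-tiling operator described above; all shapes and weights below are synthetic.

```python
import torch

# Serve several LoRA-adapted requests against one base weight: requests
# whose adapters share a rank are grouped so their low-rank updates run
# as a single batched matmul per group.
torch.manual_seed(0)
d, n_req = 32, 6
W = torch.randn(d, d)                        # shared base weight
ranks = [4, 4, 8, 4, 8, 8]                   # per-request adapter ranks
A = {i: torch.randn(r, d) for i, r in enumerate(ranks)}   # down-projections
B = {i: torch.randn(d, r) for i, r in enumerate(ranks)}   # up-projections
x = torch.randn(n_req, d)                    # one input row per request

out = x @ W.T                                # base model output
for r in set(ranks):                         # one batched matmul per rank group
    idx = [i for i, ri in enumerate(ranks) if ri == r]
    Ab = torch.stack([A[i] for i in idx])    # (g, r, d)
    Bb = torch.stack([B[i] for i in idx])    # (g, d, r)
    xg = x[idx].unsqueeze(1)                 # (g, 1, d)
    # LoRA update per request: x -> x A^T B^T, batched over the group.
    out[idx] += (xg @ Ab.transpose(1, 2) @ Bb.transpose(1, 2)).squeeze(1)
print(out.shape)                             # torch.Size([6, 32])
```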
href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinghan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinyue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yifan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yu Yang</a> , et al. (5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00820v1-abstract-short" style="display: inline;"> We present AutoGLM, a new series in the ChatGLM family, designed to serve as foundation agents for autonomous control of digital devices through Graphical User Interfaces (GUIs). While foundation models excel at acquiring human knowledge, they often struggle with decision-making in dynamic real-world environments, limiting their progress toward artificial general intelligence. This limitation unde&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00820v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00820v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00820v1-abstract-full" style="display: none;"> We present AutoGLM, a new series in the ChatGLM family, designed to serve as foundation agents for autonomous control of digital devices through Graphical User Interfaces (GUIs). While foundation models excel at acquiring human knowledge, they often struggle with decision-making in dynamic real-world environments, limiting their progress toward artificial general intelligence. This limitation underscores the importance of developing foundation agents capable of learning through autonomous environmental interactions by reinforcing existing models. Focusing on Web Browser and Phone as representative GUI scenarios, we have developed AutoGLM as a practical foundation agent system for real-world GUI interactions. Our approach integrates a comprehensive suite of techniques and infrastructures to create deployable agent systems suitable for user delivery. Through this development, we have derived two key insights: First, the design of an appropriate &#34;intermediate interface&#34; for GUI control is crucial, enabling the separation of planning and grounding behaviors, which require distinct optimization for flexibility and accuracy respectively. Second, we have developed a novel progressive training framework that enables self-evolving online curriculum reinforcement learning for AutoGLM. Our evaluations demonstrate AutoGLM&#39;s effectiveness across multiple domains. For web browsing, AutoGLM achieves a 55.2% success rate on VAB-WebArena-Lite (improving to 59.1% with a second attempt) and 96.2% on OpenTable evaluation tasks. In Android device control, AutoGLM attains a 36.2% success rate on AndroidLab (VAB-Mobile) and 89.7% on common tasks in popular Chinese APPs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00820v1-abstract-full').style.display = 'none'; document.getElementById('2411.00820v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00026">arXiv:2411.00026</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00026">pdf</a>, <a href="https://arxiv.org/format/2411.00026">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Assumptions Ordering in CAR-Based Model Checking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yibo Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jianwen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Pu%2C+G">Geguang Pu</a>, <a href="/search/cs?searchtype=author&amp;query=Strichman%2C+O">Ofer Strichman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00026v1-abstract-short" style="display: inline;"> Model checking is an automatic formal verification technique that is widely used in hardware verification. The state-of-the-art complete model-checking techniques, based on IC3/PDR and its general variant CAR, are based on computing symbolically sets of under - and over-approximating state sets (called frames) with multiple calls to a SAT solver. The performance of those techniques is sensitive to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00026v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00026v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00026v1-abstract-full" style="display: none;"> Model checking is an automatic formal verification technique that is widely used in hardware verification. The state-of-the-art complete model-checking techniques, based on IC3/PDR and its general variant CAR, are based on computing symbolically sets of under - and over-approximating state sets (called frames) with multiple calls to a SAT solver. The performance of those techniques is sensitive to the order of the assumptions with which the SAT solver is invoked, because it affects the unsatisfiable cores - which the solver emits when the formula is unsatisfiable - that crucially affect the search process. This observation was previously published in [15], where two partial assumption ordering strategies, intersection and rotation were suggested (partial in the sense that they determine the order of only a subset of the literals). In this paper we extend and improve these strategies based on an analysis of the reason for their effectiveness. We prove that intersection is effective because of what we call locality of the cores, and our improved strategy is based on this observation. 
We conclude our paper with an extensive empirical evaluation of the various ordering techniques. One of our strategies, Hybrid-CAR, which switches between strategies at runtime, not only outperforms other, fixed ordering strategies, but also outperforms other state-of-the-art bug-finding algorithms such as ABC-BMC.
Submitted 28 October, 2024; originally announced November 2024.
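The intersection strategy itself amounts to a small reordering of the assumption literals passed to the SAT solver: literals that occurred in the previous unsatisfiable core go first, banking on the locality of cores the paper proves. A schematic version, with DIMACS-style integer literals and the solver abstracted away:

```python
# "Intersection" assumption ordering: move literals that appeared in the
# previous unsat core to the front of the next query's assumption list,
# preserving the original relative order within each group.
def order_assumptions(assumptions, prev_core):
    core = set(prev_core)
    in_core = [a for a in assumptions if a in core]
    rest = [a for a in assumptions if a not in core]
    return in_core + rest

assumptions = [3, -7, 12, 5, -9]
prev_core = [5, -9]
print(order_assumptions(assumptions, prev_core))   # [5, -9, 3, -7, 12]
```

Because incremental SAT solvers branch on assumptions in the order given, front-loading likely-core literals tends to reproduce small cores quickly on the next call.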
arXiv:2410.24024 [pdf, other] cs.AI (Artificial Intelligence)
AndroidLab: Training and Systematic Benchmarking of Android Autonomous Agents
Authors: Yifan Xu, Xiao Liu, Xueqiao Sun, Siyi Cheng, Hao Yu, Hanyu Lai, Shudan Zhang, Dan Zhang, Jie Tang, Yuxiao Dong
Abstract: Autonomous agents have become increasingly important for interacting with the real world. Android agents, in particular, have recently become a frequently mentioned interaction method. However, existing studies for training and evaluating Android agents lack systematic research on both open-source and closed-source models. In this work, we propose AndroidLab as a systematic Android agent framework. It includes an operation environment with different modalities, an action space, and a reproducible benchmark. It supports both large language models (LLMs) and multimodal models (LMMs) in the same action space. The AndroidLab benchmark includes predefined Android virtual devices and 138 tasks across nine apps built on these devices. By using the AndroidLab environment, we develop an Android Instruction dataset and train six open-source LLMs and LMMs, lifting the average success rates from 4.59% to 21.50% for LLMs and from 1.93% to 13.28% for LMMs. AndroidLab is open-sourced and publicly available at https://github.com/THUDM/Android-Lab.
Submitted 4 November, 2024; v1 submitted 31 October, 2024; originally announced October 2024.

arXiv:2410.22922 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
High-Fidelity Document Stain Removal via A Large-Scale Real-World Dataset and A Memory-Augmented Transformer
Authors: Mingxian Li, Hao Sun, Yingtie Lei, Xiaofeng Zhang, Yihang Dong, Yilin Zhou, Zimeng Li, Xuhang Chen
To address this challenge, we constru&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22922v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22922v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22922v1-abstract-full" style="display: none;"> Document images are often degraded by various stains, significantly impacting their readability and hindering downstream applications such as document digitization and analysis. The absence of a comprehensive stained document dataset has limited the effectiveness of existing document enhancement methods in removing stains while preserving fine-grained details. To address this challenge, we construct StainDoc, the first large-scale, high-resolution ($2145\times2245$) dataset specifically designed for document stain removal. StainDoc comprises over 5,000 pairs of stained and clean document images across multiple scenes. This dataset encompasses a diverse range of stain types, severities, and document backgrounds, facilitating robust training and evaluation of document stain removal algorithms. Furthermore, we propose StainRestorer, a Transformer-based document stain removal approach. StainRestorer employs a memory-augmented Transformer architecture that captures hierarchical stain representations at part, instance, and semantic levels via the DocMemory module. The Stain Removal Transformer (SRTransformer) leverages these feature representations through a dual attention mechanism: an enhanced spatial attention with an expanded receptive field, and a channel attention that captures channel-wise feature importance. This combination enables precise stain removal while preserving document content integrity. Extensive experiments demonstrate StainRestorer&#39;s superior performance over state-of-the-art methods on the StainDoc dataset and its variants StainDoc_Mark and StainDoc_Seal, establishing a new benchmark for document stain removal. Our work highlights the potential of memory-augmented Transformers for this task and contributes a valuable dataset to advance future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22922v1-abstract-full').style.display = 'none'; document.getElementById('2410.22922v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
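<p>As a rough illustration of the channel-attention half of the dual attention described above, a squeeze-and-excitation-style block is sketched below; the paper's actual design may differ in detail:</p>
<pre><code>
import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Squeeze-and-excitation-style channel attention: pool each channel
    to a scalar, pass it through a small bottleneck MLP, and rescale the
    feature map by the resulting per-channel weights."""
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),
        )

    def forward(self, x):                         # x: (B, C, H, W)
        w = x.mean(dim=(2, 3))                    # squeeze: (B, C)
        w = self.mlp(w)                           # excite:  (B, C)
        return x * w.unsqueeze(-1).unsqueeze(-1)  # rescale
</code></pre>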
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WACV2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22821">arXiv:2410.22821</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22821">pdf</a>, <a href="https://arxiv.org/format/2410.22821">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Ge Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuanming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yunfei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yihong Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Zhi Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Binhua Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yongbin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22821v1-abstract-short" style="display: inline;"> How to evaluate Large Language Models (LLMs) in code generation remains an open question. Existing benchmarks have two limitations - data leakage and lack of domain-specific evaluation. The former hurts the fairness of benchmarks, and the latter hinders practitioners from selecting superior LLMs for specific programming domains. To address these two limitations, we propose a new benchmark - EvoCod&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22821v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22821v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22821v1-abstract-full" style="display: none;"> How to evaluate Large Language Models (LLMs) in code generation remains an open question. Existing benchmarks have two limitations - data leakage and lack of domain-specific evaluation. The former hurts the fairness of benchmarks, and the latter hinders practitioners from selecting superior LLMs for specific programming domains. To address these two limitations, we propose a new benchmark - EvoCodeBench, which has the following advances: (1) Evolving data. EvoCodeBench will be dynamically updated every period (e.g., 6 months) to avoid data leakage. This paper releases the first version - EvoCodeBench-2403, containing 275 samples from 25 repositories. (2) A domain taxonomy and domain labels. Based on the statistics of open-source communities, we design a programming domain taxonomy consisting of 10 popular domains. Based on the taxonomy, we annotate each sample in EvoCodeBench with a domain label. (3) Domain-specific evaluations. 
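<p>For reference, the Pass@k metric that the evaluations below build on is conventionally computed with the unbiased estimator of Chen et al. (2021): with n generations per problem of which c pass, pass@k = 1 - C(n-c, k)/C(n, k). A minimal sketch of that standard formulation (not necessarily EvoCodeBench's exact code):</p>
<pre><code>
import numpy as np

def pass_at_k(n, c, k):
    """Unbiased pass@k: probability that at least one of k samples drawn
    from n generations (c of them correct) passes. Evaluates
    1 - C(n-c, k) / C(n, k) in a numerically stable product form."""
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
</code></pre>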
Besides the Pass@k, we compute the Domain-Specific Improvement (DSI) and define LLMs&#39; comfort and strange domains. These evaluations help practitioners select superior LLMs in specific domains and discover the shortcomings of existing LLMs. We evaluate 8 popular LLMs (e.g., gpt-4, DeepSeek Coder) on EvoCodeBench and summarize some insights. EvoCodeBench reveals the actual abilities of these LLMs in real-world repositories. For example, the highest Pass@1 of gpt-4 on EvoCodeBench-2403 is only 20.74%. Besides, we evaluate LLMs in different domains and discover their comfort and strange domains. For example, gpt-4 performs best in most domains but falls behind others in the Internet domain. StarCoder 2-15B unexpectedly performs well in the Database domain and even outperforms 33B LLMs. EvoCodeBench has been released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22821v1-abstract-full').style.display = 'none'; document.getElementById('2410.22821v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21909">arXiv:2410.21909</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21909">pdf</a>, <a href="https://arxiv.org/format/2410.21909">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> SceneGenAgent: Precise Industrial Scene Generation with Coding Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+X">Xiao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Z">Zibo Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Z">Zhenyu Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+T">Tianrui Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+L">Ling Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuxiao Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21909v1-abstract-short" style="display: inline;"> The modeling of industrial scenes is essential for simulations in industrial manufacturing. 
While large language models (LLMs) have shown significant progress in generating general 3D scenes from textual descriptions, generating industrial scenes with LLMs poses a unique challenge due to their demand for precise measurements and positioning, requiring complex planning over spatial arrangement. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21909v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21909v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21909v1-abstract-full" style="display: none;"> The modeling of industrial scenes is essential for simulations in industrial manufacturing. While large language models (LLMs) have shown significant progress in generating general 3D scenes from textual descriptions, generating industrial scenes with LLMs poses a unique challenge due to their demand for precise measurements and positioning, requiring complex planning over spatial arrangement. To address this challenge, we introduce SceneGenAgent, an LLM-based agent for generating industrial scenes through C# code. SceneGenAgent ensures precise layout planning through a structured and calculable format, layout verification, and iterative refinement to meet the quantitative requirements of industrial scenarios. Experiment results demonstrate that LLMs powered by SceneGenAgent exceed their original performance, reaching up to 81.0% success rate in real-world industrial scene generation tasks and effectively meeting most scene generation requirements. To further enhance accessibility, we construct SceneInstruct, a dataset designed for fine-tuning open-source LLMs to integrate into SceneGenAgent. Experiments show that fine-tuning open-source LLMs on SceneInstruct yields significant performance improvements, with Llama3.1-70B approaching the capabilities of GPT-4o. Our code and data are available at https://github.com/THUDM/SceneGenAgent . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21909v1-abstract-full').style.display = 'none'; document.getElementById('2410.21909v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
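<p>A hypothetical sketch of the layout planning, verification, and iterative refinement loop described above; every function name here is invented, and the real SceneGenAgent emits C# scene code with richer checks:</p>
<pre><code>
def generate_scene(llm, requirement, max_rounds=3):
    """Toy plan-verify-refine loop: propose a structured layout, check its
    quantitative constraints, and feed violations back for refinement.
    `llm.propose_layout` / `llm.refine_layout` are assumed interfaces."""
    layout = llm.propose_layout(requirement)  # e.g. list of {name, x, y, w, h}
    for _ in range(max_rounds):
        errors = verify(layout)
        if not errors:
            break
        layout = llm.refine_layout(requirement, layout, errors)
    return layout                             # rendering to scene code omitted

def verify(layout):
    """Check measurable constraints; here, that no two footprints overlap."""
    errors = []
    for i, a in enumerate(layout):
        for b in layout[i + 1:]:
            separated = (a["x"] + a["w"] <= b["x"] or b["x"] + b["w"] <= a["x"]
                         or a["y"] + a["h"] <= b["y"] or b["y"] + b["h"] <= a["y"])
            if not separated:
                errors.append(a["name"] + " overlaps " + b["name"])
    return errors
</code></pre>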
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21252">arXiv:2410.21252</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21252">pdf</a>, <a href="https://arxiv.org/format/2410.21252">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LongReward: Improving Long-context Large Language Models with AI Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiajie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Z">Zhongni Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+X">Xin Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Shulin Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Z">Zhenyu Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+Y">Yilin Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+L">Lei Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuxiao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+L">Ling Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Juanzi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21252v1-abstract-short" style="display: inline;"> Though significant advancements have been achieved in developing long-context large language models (LLMs), the compromised quality of LLM-synthesized data for supervised fine-tuning (SFT) often affects the long-context performance of SFT models and leads to inherent limitations. In principle, reinforcement learning (RL) with appropriate reward signals can further enhance models&#39; capacities. Howev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21252v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21252v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21252v1-abstract-full" style="display: none;"> Though significant advancements have been achieved in developing long-context large language models (LLMs), the compromised quality of LLM-synthesized data for supervised fine-tuning (SFT) often affects the long-context performance of SFT models and leads to inherent limitations. In principle, reinforcement learning (RL) with appropriate reward signals can further enhance models&#39; capacities. However, how to obtain reliable rewards in long-context scenarios remains unexplored. To this end, we propose LongReward, a novel method that utilizes an off-the-shelf LLM to provide rewards for long-context model responses from four human-valued dimensions: helpfulness, logicality, faithfulness, and completeness, each with a carefully designed assessment pipeline. By combining LongReward and offline RL algorithm DPO, we are able to effectively improve long-context SFT models. Our experiments indicate that LongReward not only significantly improves models&#39; long-context performance but also enhances their ability to follow short instructions. 
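<p>A rough sketch of the reward computation as described: an off-the-shelf judge LLM scores each of the four dimensions and the scores are combined. The prompt wording and the plain averaging are assumptions made here; the paper defines a dedicated assessment pipeline per dimension:</p>
<pre><code>
DIMENSIONS = ("helpfulness", "logicality", "faithfulness", "completeness")

def long_reward(judge_llm, context, question, response):
    """Score a long-context response on each dimension with a judge LLM
    and combine the scores into a single scalar reward."""
    scores = []
    for dim in DIMENSIONS:
        prompt = (f"Rate the {dim} of the answer on a 0-10 scale.\n"
                  f"Context:\n{context}\n\nQuestion: {question}\n"
                  f"Answer: {response}\nScore:")
        scores.append(float(judge_llm(prompt)))  # judge returns a number
    return sum(scores) / len(scores)             # averaging is an assumption
</code></pre>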
We also find that long-context DPO with LongReward and conventional short-context DPO can be used together without hurting either one&#39;s performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21252v1-abstract-full').style.display = 'none'; document.getElementById('2410.21252v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19258">arXiv:2410.19258</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19258">pdf</a>, <a href="https://arxiv.org/format/2410.19258">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Not All Heads Matter: A Head-Level KV Cache Compression Method with Integrated Retrieval and Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yu Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Z">Zefan Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Asi%2C+A">Abedelkadir Asi</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+W">Wayne Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yue Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+W">Wen Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19258v3-abstract-short" style="display: inline;"> Key-Value (KV) caching is a common technique to enhance the computational efficiency of Large Language Models (LLMs), but its memory overhead grows rapidly with input length. Prior work has shown that not all tokens are equally important for text generation, proposing layer-level KV cache compression to selectively retain key information. Recognizing the distinct roles of attention heads in genera&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19258v3-abstract-full').style.display = 'inline'; document.getElementById('2410.19258v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19258v3-abstract-full" style="display: none;"> Key-Value (KV) caching is a common technique to enhance the computational efficiency of Large Language Models (LLMs), but its memory overhead grows rapidly with input length. Prior work has shown that not all tokens are equally important for text generation, proposing layer-level KV cache compression to selectively retain key information. Recognizing the distinct roles of attention heads in generation, we propose HeadKV, a head-level KV cache compression method, and HeadKV-R2, which leverages a novel contextual reasoning ability estimation for compression. Our approach operates at the level of individual heads, estimating their importance for contextual QA tasks that require both retrieval and reasoning capabilities. 
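<p>A toy sketch of the head-level idea just described: distribute a global KV-cache budget across heads in proportion to estimated per-head importance, then keep only the top-scoring cached tokens in each head. The importance estimation itself is the paper's contribution and is simply taken as input here:</p>
<pre><code>
import numpy as np

def allocate_head_budgets(importance, total_budget, floor=4):
    """Split a global KV-cache token budget across attention heads in
    proportion to per-head importance scores, with a small floor so that
    no head is starved entirely."""
    importance = np.asarray(importance, dtype=float)
    share = importance / importance.sum()
    return np.maximum(floor, np.round(share * total_budget)).astype(int)

def compress_head(token_scores, budget):
    """Keep the indices of the `budget` highest-scoring cached tokens."""
    order = np.argsort(token_scores)[::-1]
    return np.sort(order[:budget])

# e.g. four heads sharing a 128-token budget
head_budgets = allocate_head_budgets([0.9, 0.3, 0.1, 0.7], total_budget=128)
</code></pre>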
Extensive experiments across diverse benchmarks (LongBench, LooGLE), model architectures (e.g., Llama-3-8B-Instruct, Mistral-7B-Instruct), and long-context ability tests demonstrate that our head-level KV cache compression significantly outperforms strong baselines, particularly in low-resource settings (KV size = 64 &amp; 128). Notably, our method retains just 1.5% of the KV cache while achieving 97% of the performance of the full KV cache on the contextual question answering benchmark. Code is available at https://github.com/FYYFU/HeadKV <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19258v3-abstract-full').style.display = 'none'; document.getElementById('2410.19258v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18050">arXiv:2410.18050</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18050">pdf</a>, <a href="https://arxiv.org/format/2410.18050">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LongRAG: A Dual-Perspective Retrieval-Augmented Generation Paradigm for Long-Context Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qingfei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruobing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cen%2C+Y">Yukuo Cen</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+D">Daren Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+S">Shicheng Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuxiao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jie Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18050v2-abstract-short" style="display: inline;"> Long-Context Question Answering (LCQA), a challenging task, aims to reason over long-context documents to yield accurate answers to questions. Existing long-context Large Language Models (LLMs) for LCQA often struggle with the &#34;lost in the middle&#34; issue. Retrieval-Augmented Generation (RAG) mitigates this issue by providing external factual evidence.
However, its chunking strategy disrupts the glo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18050v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18050v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18050v2-abstract-full" style="display: none;"> Long-Context Question Answering (LCQA), a challenging task, aims to reason over long-context documents to yield accurate answers to questions. Existing long-context Large Language Models (LLMs) for LCQA often struggle with the &#34;lost in the middle&#34; issue. Retrieval-Augmented Generation (RAG) mitigates this issue by providing external factual evidence. However, its chunking strategy disrupts the global long-context information, and its low-quality retrieval in long contexts hinders LLMs from identifying effective factual details due to substantial noise. To this end, we propose LongRAG, a general, dual-perspective, and robust LLM-based RAG system paradigm for LCQA to enhance RAG&#39;s understanding of complex long-context knowledge (i.e., global information and factual details). We design LongRAG as a plug-and-play paradigm, facilitating adaptation to various domains and LLMs. Extensive experiments on three multi-hop datasets demonstrate that LongRAG significantly outperforms long-context LLMs (up by 6.94%), advanced RAG (up by 6.16%), and Vanilla RAG (up by 17.25%). Furthermore, we conduct quantitative ablation studies and multi-dimensional analyses, highlighting the effectiveness of the system&#39;s components and fine-tuning strategies. Data and code are available at https://github.com/QingFei1/LongRAG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18050v2-abstract-full').style.display = 'none'; document.getElementById('2410.18050v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
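<p>A schematic of the dual-perspective idea under invented names: pair fine-grained retrieved chunks (the factual details) with a condensed view of the full documents (the global information) in a single prompt. LongRAG's actual components are more elaborate:</p>
<pre><code>
def answer(llm, retriever, documents, question, k=8):
    """Toy dual-perspective RAG: combine retrieved chunks (factual
    details) with a coarse global view of the source documents, then let
    the LLM reason over both. All interfaces here are illustrative."""
    chunks = retriever.top_k(question, k)                     # local details
    global_view = "\n".join(doc[:2000] for doc in documents)  # crude global view
    prompt = (f"Global context:\n{global_view}\n\n"
              "Retrieved evidence:\n" + "\n".join(chunks) +
              f"\n\nQuestion: {question}\nAnswer:")
    return llm(prompt)
</code></pre>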
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Main, Final</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15444">arXiv:2410.15444</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15444">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MDFI-Net: Multiscale Differential Feature Interaction Network for Accurate Retinal Vessel Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yiwang Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+X">Xiangyu Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15444v1-abstract-short" style="display: inline;"> The accurate segmentation of retinal vessels in fundus images is a great challenge in medical image segmentation tasks due to their highly complex structure compared with other organs. Currently, deep-learning-based methods for retinal vessel segmentation achieve suboptimal outcomes, since vessels with indistinct features are prone to being overlooked in deeper layers of the network. Additionally, the abunda&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15444v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15444v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15444v1-abstract-full" style="display: none;"> The accurate segmentation of retinal vessels in fundus images is a great challenge in medical image segmentation tasks due to their highly complex structure compared with other organs. Currently, deep-learning-based methods for retinal vessel segmentation achieve suboptimal outcomes, since vessels with indistinct features are prone to being overlooked in deeper layers of the network. Additionally, the abundance of redundant information in the background poses significant interference to feature extraction, thus increasing the segmentation difficulty. To address this issue, this paper proposes a feature-enhanced interaction network based on DPCN, named MDFI-Net. Specifically, we design a feature enhancement structure, the Deformable-convolutional Pulse Coupling Network (DPCN), to provide an enhanced feature iteration sequence to the segmentation network in a simple and efficient manner. Subsequently, these features will interact within the segmentation network. Extensive experiments were conducted on publicly available retinal vessel segmentation datasets to validate the effectiveness of our network structure. Experimental results show that the retinal blood vessel detection accuracy reaches 97.91%, 97.97%, and 98.16% across all datasets. Finally, extensive experimental results also show that the proposed MDFI-Net achieves segmentation performance superior to state-of-the-art methods on public datasets.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15444v1-abstract-full').style.display = 'none'; document.getElementById('2410.15444v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15165">arXiv:2410.15165</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15165">pdf</a>, <a href="https://arxiv.org/format/2410.15165">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Explaining Graph Neural Networks with Large Language Models: A Counterfactual Perspective for Molecular Property Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yinhan He</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zaiyi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Soga%2C+P">Patrick Soga</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yaozhen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+y">yushun Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jundong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15165v1-abstract-short" style="display: inline;"> In recent years, Graph Neural Networks (GNNs) have become successful in molecular property prediction tasks such as toxicity analysis. However, due to the black-box nature of GNNs, their outputs can be concerning in high-stakes decision-making scenarios, e.g., drug discovery. Facing such an issue, Graph Counterfactual Explanation (GCE) has emerged as a promising approach to improve GNN transparenc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15165v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15165v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15165v1-abstract-full" style="display: none;"> In recent years, Graph Neural Networks (GNNs) have become successful in molecular property prediction tasks such as toxicity analysis. However, due to the black-box nature of GNNs, their outputs can be concerning in high-stakes decision-making scenarios, e.g., drug discovery. Facing such an issue, Graph Counterfactual Explanation (GCE) has emerged as a promising approach to improve GNN transparency. However, current GCE methods usually fail to take domain-specific knowledge into consideration, which can result in outputs that are not easily comprehensible by humans. To address this challenge, we propose a novel GCE method, LLM-GCE, to unleash the power of large language models (LLMs) in explaining GNNs for molecular property prediction. 
Specifically, we utilize an autoencoder to generate the counterfactual graph topology from a set of counterfactual text pairs (CTPs) based on an input graph. Meanwhile, we also incorporate a CTP dynamic feedback module to mitigate LLM hallucination, which provides intermediate feedback derived from the generated counterfactuals as an attempt to give more faithful guidance. Extensive experiments demonstrate the superior performance of LLM-GCE. Our code is released at https://github.com/YinhanHe123/new_LLM4GNNExplanation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15165v1-abstract-full').style.display = 'none'; document.getElementById('2410.15165v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> EMNLP 2024 (Findings) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14632">arXiv:2410.14632</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14632">pdf</a>, <a href="https://arxiv.org/format/2410.14632">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Diverging Preferences: When do Annotators Disagree and do Models Know? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M+J">Michael JQ Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhilin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hwang%2C+J+D">Jena D. Hwang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yi Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Delalleau%2C+O">Olivier Delalleau</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+E">Eunsol Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+X">Xiang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Pyatkin%2C+V">Valentina Pyatkin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14632v2-abstract-short" style="display: inline;"> We examine diverging preferences in human-labeled preference datasets. We develop a taxonomy of disagreement sources spanning 10 categories across four high-level classes -- task underspecification, response style, refusals, and annotation errors. We find that the majority of disagreements are in opposition to standard reward modeling approaches, which are designed with the assumption that annot&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14632v2-abstract-full').style.display = 'inline'; document.getElementById('2410.14632v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14632v2-abstract-full" style="display: none;"> We examine diverging preferences in human-labeled preference datasets.
We develop a taxonomy of disagreement sources spanning 10 categories across four high-level classes -- task underspecification, response style, refusals, and annotation errors. We find that the majority of disagreements are in opposition to standard reward modeling approaches, which are designed with the assumption that annotator disagreement is noise. We then explore how these findings impact two areas of LLM development: reward modeling and evaluation. In our experiments, we demonstrate how standard reward modeling methods, like the Bradley-Terry model, fail to differentiate whether a given preference judgment is the result of unanimous agreement among annotators or the majority opinion among diverging user preferences. We find that these tendencies are also echoed by popular LLM-as-Judge evaluation methods, which consistently identify a winning response in cases of diverging preferences. These findings highlight remaining challenges in LLM evaluations, which are greatly influenced by divisive features like response style, and in developing pluralistically aligned LLMs. To address these issues, we develop methods for identifying diverging preferences to mitigate their influence on evaluation and training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14632v2-abstract-full').style.display = 'none'; document.getElementById('2410.14632v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14165">arXiv:2410.14165</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14165">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Automated Genre-Aware Article Scoring and Feedback Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chihang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuxin Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhenhong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruotong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiajing Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14165v1-abstract-short" style="display: inline;"> This paper focuses on the development of an advanced intelligent article scoring system that not only assesses the overall quality of written work but also offers detailed feature-based scoring tailored to various article genres.
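<p>To make the Bradley-Terry limitation above concrete: under P(A preferred over B) = sigmoid(r_A - r_B), the maximum-likelihood reward margin depends only on the empirical preference rate p through its log-odds, so a unanimous vote and a divisive 60/40 split differ only in magnitude, never in kind. A tiny numeric check:</p>
<pre><code>
import math

def bt_margin(p, eps=1e-6):
    """Maximum-likelihood Bradley-Terry reward margin r_A - r_B for an
    empirical preference rate p = P(A preferred over B): the log-odds."""
    p = min(max(p, eps), 1 - eps)
    return math.log(p / (1 - p))

print(bt_margin(0.6))   # divisive 60/40 split -> ~0.41
print(bt_margin(0.99))  # near-unanimous       -> ~4.6
# Both cases yield a single scalar margin; the model cannot represent
# "annotators genuinely disagree" separately from "A is slightly better".
</code></pre>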
By integrating the pre-trained BERT model with the large language model Chat-GPT, the system gains a deep understanding of both the content and structure of the text, ena&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14165v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14165v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14165v1-abstract-full" style="display: none;"> This paper focuses on the development of an advanced intelligent article scoring system that not only assesses the overall quality of written work but also offers detailed feature-based scoring tailored to various article genres. By integrating the pre-trained BERT model with the large language model Chat-GPT, the system gains a deep understanding of both the content and structure of the text, enabling it to provide a thorough evaluation along with targeted suggestions for improvement. Experimental results demonstrate that this system outperforms traditional scoring methods across multiple public datasets, particularly in feature-based assessments, offering a more accurate reflection of the quality of different article types. Moreover, the system generates personalized feedback to assist users in enhancing their writing skills, underscoring the potential and practical value of automated scoring technologies in educational contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14165v1-abstract-full').style.display = 'none'; document.getElementById('2410.14165v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13267">arXiv:2410.13267</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13267">pdf</a>, <a href="https://arxiv.org/format/2410.13267">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CLaMP 2: Multimodal Music Information Retrieval Across 101 Languages Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Shangda Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yashan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhancheng Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+M">Monan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+X">Xuefeng Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yuejie Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuanliang Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiafeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaobing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13267v1-abstract-short" style="display: inline;"> Challenges in managing linguistic diversity and integrating various musical modalities are faced by current music information retrieval systems. These limitations reduce their effectiveness in a global, multimodal music environment. To address these issues, we introduce CLaMP 2, a system compatible with 101 languages that supports both ABC notation (a text-based musical notation format) and MIDI (&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13267v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13267v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13267v1-abstract-full" style="display: none;"> Challenges in managing linguistic diversity and integrating various musical modalities are faced by current music information retrieval systems. These limitations reduce their effectiveness in a global, multimodal music environment. To address these issues, we introduce CLaMP 2, a system compatible with 101 languages that supports both ABC notation (a text-based musical notation format) and MIDI (Musical Instrument Digital Interface) for music information retrieval. 
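<p>The text-music alignment mentioned next is contrastive; a generic CLIP-style symmetric objective over matched text-music pairs, which may differ in detail from CLaMP 2's exact loss, looks like:</p>
<pre><code>
import torch
import torch.nn.functional as F

def contrastive_loss(text_emb, music_emb, temperature=0.07):
    """Symmetric (CLIP-style) contrastive loss: matched text/music pairs
    sit on the diagonal of the similarity matrix and act as positives;
    every other pair in the batch serves as a negative."""
    text_emb = F.normalize(text_emb, dim=-1)
    music_emb = F.normalize(music_emb, dim=-1)
    logits = text_emb @ music_emb.t() / temperature   # (B, B)
    targets = torch.arange(logits.size(0), device=logits.device)
    return (F.cross_entropy(logits, targets) +
            F.cross_entropy(logits.t(), targets)) / 2
</code></pre>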
CLaMP 2, pre-trained on 1.5 million ABC-MIDI-text triplets, includes a multilingual text encoder and a multimodal music encoder aligned via contrastive learning. By leveraging large language models, we obtain refined and consistent multilingual descriptions at scale, significantly reducing textual noise and balancing language distribution. Our experiments show that CLaMP 2 achieves state-of-the-art results in both multilingual semantic search and music classification across modalities, thus establishing a new standard for inclusive and global music information retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13267v1-abstract-full').style.display = 'none'; document.getElementById('2410.13267v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 10 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13187">arXiv:2410.13187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13187">pdf</a>, <a href="https://arxiv.org/format/2410.13187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> aiXcoder-7B: A Lightweight and Effective Large Language Model for Code Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+S">Siyuan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zong%2C+H">He Zong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Huanyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Hao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shukai Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+E">Erlu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+J">Jiazheng Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+W">Wei Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Gen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yihong Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kechi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Ge Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13187v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have been widely used in code completion, and researchers are focusing on scaling up LLMs to improve their accuracy. 
However, larger LLMs will increase the response time of code completion and decrease the developers&#39; productivity. In this paper, we propose a lightweight and effective LLM for code completion named aiXcoder-7B. Compared to existing LLMs, aiXcoder-7B ach&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13187v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13187v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13187v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have been widely used in code completion, and researchers are focusing on scaling up LLMs to improve their accuracy. However, larger LLMs will increase the response time of code completion and decrease the developers&#39; productivity. In this paper, we propose a lightweight and effective LLM for code completion named aiXcoder-7B. Compared to existing LLMs, aiXcoder-7B achieves higher code completion accuracy while having smaller scales (i.e., 7 billion parameters). We attribute the superiority of aiXcoder-7B to three key factors: (1) Multi-objective training. We employ three training objectives, one of which is our proposed Structured Fill-In-the-Middle (SFIM). SFIM considers the syntax structures in code and effectively improves the performance of LLMs for code. (2) Diverse data sampling strategies. They consider inter-file relationships and enhance the capability of LLMs in understanding cross-file contexts. (3) Extensive high-quality data. We establish a rigorous data collection pipeline and consume a total of 1.2 trillion unique tokens for training aiXcoder-7B. This vast volume of data enables aiXcoder-7B to learn a broad distribution of code. We evaluate aiXcoder-7B on five popular code completion benchmarks and a new benchmark collected by this paper. The results show that aiXcoder-7B outperforms the latest six LLMs with similar sizes and even surpasses four larger LLMs (e.g., StarCoder2-15B and CodeLlama-34B), positioning aiXcoder-7B as a lightweight and effective LLM for academia and industry. Finally, we summarize three valuable insights for helping practitioners train the next generations of LLMs for code. aiXcoder-7B has been open-sourced and gained significant attention. As of the submission date, aiXcoder-7B has received 2,193 GitHub Stars. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13187v2-abstract-full').style.display = 'none'; document.getElementById('2410.13187v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
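<p>Plain fill-in-the-middle training rearranges a snippet into prefix/suffix/middle segments separated by sentinel tokens; SFIM, as described above, additionally aligns the masked middle with the code's syntax structure. A sketch of the vanilla transform, where the sentinel strings are illustrative stand-ins for real tokenizer special tokens:</p>
<pre><code>
import random

PRE, SUF, MID = "[FIM_PREFIX]", "[FIM_SUFFIX]", "[FIM_MIDDLE]"  # stand-ins

def make_fim_example(code, rng=random):
    """Rearrange code into a fill-in-the-middle training sample:
    prefix and suffix first, middle last, so the model learns to infill.
    Plain FIM cuts at random offsets; a structured variant such as SFIM
    would instead pick the middle span to cover a syntax-tree node."""
    i, j = sorted(rng.sample(range(len(code) + 1), 2))
    prefix, middle, suffix = code[:i], code[i:j], code[j:]
    return PRE + prefix + SUF + suffix + MID + middle
</code></pre>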
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">aiXcoder-7B is available at https://github.com/aixcoder-plugin/aiXcoder-7B</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09718">arXiv:2410.09718</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09718">pdf</a>, <a href="https://arxiv.org/format/2410.09718">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Tidal Current Speed Forecasting Model based on Multiple Periodicity Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+T">Tengfei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yunxuan Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yangdi Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09718v1-abstract-short" style="display: inline;"> Tidal energy is one of the key components in increasing the penetration rate of renewable energy. The penetration of tidal energy in the electrical grid depends on the accuracy of tidal current speed forecasting. Modeling inaccuracies hinder forecast accuracy. Previous research has primarily used physical models to forecast tidal current speed. However, tidal current variations influenced by the o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09718v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09718v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09718v1-abstract-full" style="display: none;"> Tidal energy is one of the key components in increasing the penetration rate of renewable energy. The penetration of tidal energy in the electrical grid depends on the accuracy of tidal current speed forecasting. Modeling inaccuracies hinder forecast accuracy. Previous research has primarily used physical models to forecast tidal current speed. However, tidal current variations influenced by the orbital periods of celestial bodies make accurate physical modeling challenging. Researching the multiple periodicity of tides is crucial for accurately forecasting tidal current speed. In this article, we propose the Wavelet-Enhanced Convolutional Network (WCN) to learn multiple periodicity. The framework embeds intra-period and inter-period variations of one-dimensional tidal current data into the rows and columns of a two-dimensional tensor. Then, the two-dimensional variations of the sequence can be processed by convolutional kernels. We integrate a time-frequency analysis method into the framework to further address local periodic features. Additionally, to enhance the framework&#39;s stability, we optimize the framework&#39;s hyperparameters with the Tree-structured Parzen Estimator algorithm. The proposed framework thus avoids failing to learn the multiple periodicities of tides.
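<p>The one-dimensional-to-two-dimensional embedding described above can be pictured as folding the series by its period, so that columns carry intra-period variation and rows carry inter-period variation. A minimal sketch, with period detection (e.g., via FFT) and the wavelet branch omitted:</p>
<pre><code>
import numpy as np

def fold_by_period(series, period):
    """Reshape a 1D series into a (cycles, period) tensor: each row is
    one cycle, so 2D convolutions can see intra-period variation along
    columns and inter-period variation down rows."""
    series = np.asarray(series, dtype=float)
    n_cycles = len(series) // period
    return series[:n_cycles * period].reshape(n_cycles, period)

# e.g. hourly speeds folded by a ~12-hour semidiurnal tidal period
grid = fold_by_period(np.sin(np.linspace(0, 40 * np.pi, 600)), period=12)
</code></pre>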
Compared with benchmarks, the proposed framework reduces the mean absolute error and mean square error in 10-step forecasting by, at most, 90.36% and 97.56%, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09718v1-abstract-full').style.display = 'none'; document.getElementById('2410.09718v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08620">arXiv:2410.08620</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08620">pdf</a>, <a href="https://arxiv.org/format/2410.08620">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Natural Language Induced Adversarial Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xiaopei Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+P">Peiyang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+G">Guanning Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yingpeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiaolin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08620v1-abstract-short" style="display: inline;"> Research of adversarial attacks is important for AI security because it shows the vulnerability of deep learning models and helps to build more robust models. Adversarial attacks on images are most widely studied, which include noise-based attacks, image editing-based attacks, and latent space-based attacks. However, the adversarial examples crafted by these methods often lack sufficient semantic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08620v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08620v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08620v1-abstract-full" style="display: none;"> Research of adversarial attacks is important for AI security because it shows the vulnerability of deep learning models and helps to build more robust models. Adversarial attacks on images are most widely studied, which include noise-based attacks, image editing-based attacks, and latent space-based attacks. However, the adversarial examples crafted by these methods often lack sufficient semantic information, making it challenging for humans to understand the failure modes of deep learning models under natural conditions. To address this limitation, we propose a natural language induced adversarial image attack method. 
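<p>The prompt optimization described next is gradient-free. A toy genetic-algorithm loop over discrete prompts might look like the following, where the scoring function, assumed to synthesize an image and query the target classifier, is stubbed out:</p>
<pre><code>
import random

def ga_prompt_attack(score, words, pop_size=16, length=8, generations=20):
    """Toy genetic search over discrete prompts. `score(prompt)` is an
    assumed callable that renders the prompt with a text-to-image model
    and returns how strongly the target classifier is fooled."""
    pop = [[random.choice(words) for _ in range(length)]
           for _ in range(pop_size)]
    for _ in range(generations):
        pop.sort(key=lambda p: score(" ".join(p)), reverse=True)
        parents = pop[: pop_size // 2]                 # keep the fittest half
        children = []
        for _ in range(pop_size - len(parents)):
            a, b = random.sample(parents, 2)
            cut = random.randrange(1, length)          # single-point crossover
            child = a[:cut] + b[cut:]
            if random.random() < 0.3:                  # occasional mutation
                child[random.randrange(length)] = random.choice(words)
            children.append(child)
        pop = parents + children
    return " ".join(pop[0])
</code></pre>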
arXiv:2410.07968 [pdf, other] (https://arxiv.org/abs/2410.07968)
Subjects: cs.NE (Neural and Evolutionary Computing)
Title: Octopus Inspired Optimization Algorithm: Multi-Level Structures and Parallel Computing Strategies
Authors: Xu Wang, Longji Xu, Yiquan Wang, Yuhua Dong, Xiang Li, Jia Deng, Rui He
Abstract: This paper introduces a novel bionic intelligent optimisation algorithm, the Octopus Inspired Optimization (OIO) algorithm, which is inspired by the neural structure of the octopus, especially its hierarchical and decentralised interaction properties. By simulating the sensory, decision-making, and executive abilities of octopuses, the OIO algorithm adopts a multi-level hierarchical strategy, including tentacles, suckers, individuals and groups, to achieve an effective combination of global and local search. This hierarchical design enhances the flexibility of the algorithm and significantly improves its search efficiency and adaptability. In performance evaluations, including comparisons with existing mainstream intelligent optimisation algorithms, OIO shows faster convergence and higher accuracy, especially when dealing with multimodal functions and high-dimensional optimisation problems. This advantage is even more pronounced as the required minimum accuracy rises, with the OIO algorithm showing an average speedup of 2.27 times that of conventional particle swarm optimisation (PSO) and 9.63 times that of differential evolution (DE) on multimodal functions. On high-dimensional optimisation problems in particular, OIO achieves an average speedup of 10.39 times over DE, demonstrating its superior computational efficiency. In addition, the OIO algorithm reduces CPU usage by about 5% compared to PSO, reflecting more efficient use of computational resources. These features give the OIO algorithm great potential for complex optimisation problems, and it is especially suitable for application scenarios that require fast, efficient and robust optimisation methods, such as robot path planning, supply chain management optimisation, and energy system management.
Submitted 10 October, 2024; originally announced October 2024.
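The abstract names the multi-level structure but not the update equations, so the sketch below only illustrates the general pattern it describes: a global layer of "tentacles" holds search centers while their "suckers" sample locally, and each tentacle drifts toward its best local find. All coefficients are hypothetical, not OIO's actual rules.

```python
# Illustrative two-level global/local search, not the published OIO algorithm.
import numpy as np

def oio_like_search(f, dim=10, tentacles=4, suckers=8, iters=200, scale=5.0):
    rng = np.random.default_rng(0)
    centers = rng.uniform(-scale, scale, (tentacles, dim))   # global layer
    best_x, best_val = None, np.inf
    for _ in range(iters):
        for t in range(tentacles):
            # local layer: suckers sample around their tentacle's center
            cands = centers[t] + rng.normal(0.0, 0.5, (suckers, dim))
            vals = np.apply_along_axis(f, 1, cands)
            i = vals.argmin()
            if vals[i] < best_val:
                best_x, best_val = cands[i], vals[i]
            # decentralised update: move the tentacle toward its best sucker
            centers[t] = 0.7 * centers[t] + 0.3 * cands[i]
    return best_x, best_val

x, v = oio_like_search(lambda x: np.sum(x ** 2))   # toy objective
print(round(v, 6))
```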
arXiv:2410.07273 [pdf, other] (https://arxiv.org/abs/2410.07273)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: BELM: Bidirectional Explicit Linear Multi-step Sampler for Exact Inversion in Diffusion Models
Authors: Fangyikang Wang, Hubery Yin, Yuejiang Dong, Huminhao Zhu, Chao Zhang, Hanbin Zhao, Hui Qian, Chen Li
Abstract: The inversion of diffusion model sampling, which aims to find the corresponding initial noise of a sample, plays a critical role in various tasks. Recently, several heuristic exact inversion samplers have been proposed to address the inexact inversion issue in a training-free manner. However, the theoretical properties of these heuristic samplers remain unknown, and they often exhibit mediocre sampling quality. In this paper, we introduce a generic formulation of the exact inversion samplers, Bidirectional Explicit Linear Multi-step (BELM) samplers, which includes all previously proposed heuristic exact inversion samplers as special cases. The BELM formulation is derived from the variable-stepsize-variable-formula linear multi-step method by integrating a bidirectional explicit constraint. We highlight that this bidirectional explicit constraint is the key to mathematically exact inversion. We systematically investigate the Local Truncation Error (LTE) within the BELM framework and show that the existing heuristic designs of exact inversion samplers yield sub-optimal LTE. Consequently, we propose the Optimal BELM (O-BELM) sampler through an LTE minimization approach. We conduct additional analysis to substantiate the theoretical stability and global convergence of the proposed optimal sampler. Comprehensive experiments demonstrate that our O-BELM sampler establishes the exact inversion property while achieving high-quality sampling. Additional experiments in image editing and image interpolation highlight the broad potential of applying O-BELM in a variety of applications.
Submitted 9 October, 2024; originally announced October 2024.
Comments: Accepted by NeurIPS
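The abstract's central point, that a step which evaluates the noise network only at a state shared by both directions can be inverted by exact algebra, can be checked numerically with a toy two-step recurrence. The coefficients and the eps() stand-in below are illustrative, not the paper's derived O-BELM coefficients.

```python
# Toy check of the bidirectional-explicit property: eps is evaluated only at
# the retained state x_i, so the linear relation solves exactly either way.
import numpy as np

def eps(x):                      # stand-in for a noise-prediction network
    return np.tanh(x)

a, b, c = 0.9, 0.4, 0.1          # illustrative multi-step coefficients

def backward_step(x_next, x_cur):            # sampling direction: -> x_{i-1}
    return a * x_next + b * x_cur + c * eps(x_cur)

def forward_step(x_prev, x_cur):             # inversion: recover x_{i+1}
    return (x_prev - b * x_cur - c * eps(x_cur)) / a

x_next = np.random.randn(4)
x_cur = np.random.randn(4)
x_prev = backward_step(x_next, x_cur)
print(np.allclose(forward_step(x_prev, x_cur), x_next))   # True: exact
```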
arXiv:2410.07093 [pdf, other] (https://arxiv.org/abs/2410.07093)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: LaMP: Language-Motion Pretraining for Motion Generation, Retrieval, and Captioning
Authors: Zhe Li, Weihao Yuan, Yisheng He, Lingteng Qiu, Shenhao Zhu, Xiaodong Gu, Weichao Shen, Yuan Dong, Zilong Dong, Laurence T. Yang
Abstract: Language plays a vital role in the realm of human motion. Existing methods have largely depended on CLIP text embeddings for motion generation, yet they fall short in effectively aligning language and motion due to CLIP's pretraining on static image-text pairs. This work introduces LaMP, a novel Language-Motion Pretraining model, which transitions from a language-vision to a more suitable language-motion latent space. It addresses key limitations by generating motion-informative text embeddings, significantly enhancing the relevance and semantics of generated motion sequences. With LaMP, we advance three key tasks: text-to-motion generation, motion-text retrieval, and motion captioning through aligned language-motion representation learning. For generation, we utilize LaMP to provide the text condition instead of CLIP, and an autoregressive masked prediction is designed to achieve mask modeling without rank collapse in transformers. For retrieval, motion features from LaMP's motion transformer interact with query tokens to retrieve text features from the text transformer, and vice versa. For captioning, we finetune a large language model with the language-informative motion features to develop a strong motion captioning model. In addition, we introduce the LaMP-BertScore metric to assess the alignment of generated motions with textual descriptions. Extensive experimental results on multiple datasets demonstrate substantial improvements over previous methods across all three tasks. The code of our method will be made public.
Submitted 9 October, 2024; originally announced October 2024.
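For aligning two modalities in a shared latent space, the standard mechanism is a symmetric contrastive objective. The sketch below shows CLIP-style InfoNCE between motion and text embeddings as an assumed building block of language-motion pretraining; the encoders themselves, and LaMP's exact training recipe, are omitted.

```python
# Hedged sketch: symmetric InfoNCE between motion and text embeddings.
import torch
import torch.nn.functional as F

def contrastive_loss(motion_emb, text_emb, temperature=0.07):
    m = F.normalize(motion_emb, dim=-1)
    t = F.normalize(text_emb, dim=-1)
    logits = m @ t.T / temperature            # (batch, batch) similarities
    labels = torch.arange(len(m))             # matched pairs on the diagonal
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.T, labels)) / 2

loss = contrastive_loss(torch.randn(8, 256), torch.randn(8, 256))
print(loss.item())
```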
arXiv:2410.05605 [pdf, other] (https://arxiv.org/abs/2410.05605)
Subjects: cs.SE (Software Engineering)
Title: CodeDPO: Aligning Code Models with Self Generated and Verified Source Code
Authors: Kechi Zhang, Ge Li, Yihong Dong, Jingjing Xu, Jun Zhang, Jing Su, Yongfei Liu, Zhi Jin
Abstract: Code generation models have shown significant potential for programming tasks. However, existing training methods like supervised fine-tuning face key limitations: they do not effectively teach models to prioritize correct over incorrect solutions in ambiguous situations, nor do they effectively optimize the runtime efficiency of the generated code. To address these challenges, we propose CodeDPO, a framework that integrates preference learning into code generation to improve two key code preference factors: code correctness and efficiency. CodeDPO employs a novel dataset construction method, utilizing a self-generation-and-validation mechanism that simultaneously generates and evaluates code and test cases. The underlying assumption is that test cases executable by multiple code snippets provide more reliable validation, and code that passes more tests is more likely to be correct. Through this self-validation process, our PageRank-inspired algorithm iteratively updates the ranking score of each code snippet, ultimately creating a code preference optimization dataset based on correctness and efficiency. CodeDPO is flexible and scalable, generating diverse preference optimization data without depending on external resources. In comprehensive evaluations on five widely used benchmarks, CodeDPO demonstrates significant improvements in correctness and efficiency compared to existing methods. Our experiments show that CodeDPO enhances the capabilities of LLMs in code generation and provides a robust foundation for conducting code preference optimization in more complex and challenging real-world scenarios.
Submitted 7 October, 2024; originally announced October 2024.
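The PageRank-inspired mutual ranking the abstract describes can be sketched directly: a snippet's score grows with the scores of the tests it passes, and a test's score grows with the scores of the snippets that pass it. The damping factor and convergence check below are standard PageRank machinery, assumed rather than taken from the paper.

```python
# Hedged sketch of mutual code/test ranking over a pass matrix.
import numpy as np

def rank_code_and_tests(passes: np.ndarray, d=0.85, iters=100, tol=1e-9):
    """passes[i, j] = 1 if code snippet i passes test j."""
    n_code, n_test = passes.shape
    code = np.ones(n_code) / n_code
    test = np.ones(n_test) / n_test
    for _ in range(iters):
        new_test = (1 - d) / n_test + d * (passes.T @ code)
        new_code = (1 - d) / n_code + d * (passes @ new_test)
        new_code /= new_code.sum()
        new_test /= new_test.sum()
        if np.abs(new_code - code).max() < tol:
            break
        code, test = new_code, new_test
    return code, test

passes = np.array([[1, 1, 1],      # snippet 0 passes all three tests
                   [1, 1, 0],
                   [0, 0, 1]])
code_score, test_score = rank_code_and_tests(passes)
print(code_score.argmax())          # 0: the most-trusted snippet
```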
arXiv:2410.05255 [pdf, other] (https://arxiv.org/abs/2410.05255)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: SePPO: Semi-Policy Preference Optimization for Diffusion Alignment
Authors: Daoan Zhang, Guangchen Lan, Dong-Jun Han, Wenlin Yao, Xiaoman Pan, Hongming Zhang, Mingxiao Li, Pengcheng Chen, Yu Dong, Christopher Brinton, Jiebo Luo
Abstract: Reinforcement learning from human feedback (RLHF) methods are emerging as a way to fine-tune diffusion models (DMs) for visual generation. However, commonly used on-policy strategies are limited by the generalization capability of the reward model, while off-policy approaches require large amounts of difficult-to-obtain paired human-annotated data, particularly in visual generation tasks. To address the limitations of both on- and off-policy RLHF, we propose a preference optimization method that aligns DMs with preferences without relying on reward models or paired human-annotated data. Specifically, we introduce a Semi-Policy Preference Optimization (SePPO) method. SePPO leverages previous checkpoints as reference models while using them to generate on-policy reference samples, which replace "losing images" in preference pairs. This approach allows us to optimize using only off-policy "winning images." Furthermore, we design a strategy for reference model selection that expands the exploration in the policy space. Notably, we do not simply treat reference samples as negative examples for learning. Instead, we design an anchor-based criterion to assess whether the reference samples are likely to be winning or losing images, allowing the model to selectively learn from the generated reference samples. This approach mitigates performance degradation caused by the uncertainty in reference sample quality. We validate SePPO across both text-to-image and text-to-video benchmarks. SePPO surpasses all previous approaches on the text-to-image benchmarks and also demonstrates outstanding performance on the text-to-video benchmarks. Code will be released at https://github.com/DwanZhang-AI/SePPO
Submitted 7 October, 2024; originally announced October 2024.
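One plausible reading of the mechanism, sketched below under explicit assumptions: a DPO-style preference logit pits the off-policy winning image against a sample generated by a past checkpoint, and an anchor-based criterion decides whether that reference sample should be pushed down as a loser or learned from instead. The scoring inputs, the sign-flip gating, and beta are illustrative stand-ins, not the paper's loss.

```python
# Hedged sketch of a selective DPO-style objective; all details assumed.
import torch
import torch.nn.functional as F

def seppo_like_loss(logp_win, logp_ref_sample, ref_logp_win, ref_logp_ref,
                    anchor_margin, beta=0.1):
    """Per-example log-probs under the policy and the reference checkpoint."""
    # preference logit: human-preferred winner vs reference-generated sample
    logits = beta * ((logp_win - ref_logp_win) -
                     (logp_ref_sample - ref_logp_ref))
    # anchor criterion (assumed): where the reference sample looks like a
    # winner, reverse the learning direction instead of suppressing it
    sign = torch.where(anchor_margin > 0,
                       torch.tensor(-1.0), torch.tensor(1.0))
    return -F.logsigmoid(sign * logits).mean()

loss = seppo_like_loss(torch.randn(8), torch.randn(8),
                       torch.randn(8), torch.randn(8),
                       anchor_margin=torch.randn(8))
print(loss.item())
```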
arXiv:2410.05161 [pdf, other] (https://arxiv.org/abs/2410.05161)
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing)
Title: A Seesaw Model Attack Algorithm for Distributed Learning
Authors: Kun Yang, Tianyi Luo, Yanjie Dong, Aohan Li
Abstract: We investigate the Byzantine attack problem within the context of model training in distributed learning systems. Although common solvers (e.g., SGD, Adam, RMSProp) ensure the convergence of current model training processes, they can be easily compromised by malicious nodes in these systems. Consequently, the training process may either converge slowly or even diverge. To develop effective secure distributed learning solvers, it is crucial to first examine attack methods in order to assess the robustness of these solvers. In this work, we contribute to the design of attack strategies by first highlighting the limitations of finite-norm attacks. We then introduce the seesaw attack, which is demonstrated to be more effective than the finite-norm attack. Through numerical experiments, we evaluate the efficacy of the seesaw attack across various gradient aggregation rules.
Submitted 7 October, 2024; originally announced October 2024.
Comments: Accepted for presentation at IEEE SmartIoT 2024
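The seesaw construction itself is not specified in the abstract, so the toy harness below only shows the motivating limitation: against a robust aggregation rule such as the coordinate-wise median, a large finite-norm attack is simply outvoted. Node counts and the aggregation rule are illustrative.

```python
# Toy Byzantine setting: honest gradients plus a blunt finite-norm attack.
import numpy as np

def aggregate_median(grads):
    return np.median(grads, axis=0)     # robust coordinate-wise aggregation

rng = np.random.default_rng(1)
honest = rng.normal(1.0, 0.1, (8, 5))          # 8 honest nodes, 5 parameters
malicious = np.full((3, 5), -100.0)            # large but finite-norm attack
agg = aggregate_median(np.vstack([honest, malicious]))
print(np.round(agg, 2))   # median stays near the honest mean: attack blunted
```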
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at IEEE SmartIoT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04659">arXiv:2410.04659</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04659">pdf</a>, <a href="https://arxiv.org/format/2410.04659">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ActiView: Evaluating Active Perception Ability for Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziyue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+F">Fuwen Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yurui Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuanchi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yuzhuang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04659v1-abstract-short" style="display: inline;"> Active perception, a crucial human capability, involves setting a goal based on the current understanding of the environment and performing actions to achieve that goal. Despite significant efforts in evaluating Multimodal Large Language Models (MLLMs), active perception has been largely overlooked. To address this gap, we propose a novel benchmark named ActiView to evaluate active perception in M&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04659v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04659v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04659v1-abstract-full" style="display: none;"> Active perception, a crucial human capability, involves setting a goal based on the current understanding of the environment and performing actions to achieve that goal. Despite significant efforts in evaluating Multimodal Large Language Models (MLLMs), active perception has been largely overlooked. To address this gap, we propose a novel benchmark named ActiView to evaluate active perception in MLLMs. Since comprehensively assessing active perception is challenging, we focus on a specialized form of Visual Question Answering (VQA) that eases the evaluation yet challenging for existing MLLMs. Given an image, we restrict the perceptual field of a model, requiring it to actively zoom or shift its perceptual field based on reasoning to answer the question successfully. We conduct extensive evaluation over 27 models, including proprietary and open-source models, and observe that the ability to read and comprehend multiple images simultaneously plays a significant role in enabling active perception. 
arXiv:2410.04190 [pdf, other] (https://arxiv.org/abs/2410.04190)
Subjects: cs.CR (Cryptography and Security); cs.CL (Computation and Language)
Title: Harnessing Task Overload for Scalable Jailbreak Attacks on Large Language Models
Authors: Yiting Dong, Guobin Shen, Dongcheng Zhao, Xiang He, Yi Zeng
Abstract: Large Language Models (LLMs) remain vulnerable to jailbreak attacks that bypass their safety mechanisms. Existing attack methods are fixed or specifically tailored for certain models and cannot flexibly adjust attack strength, which is critical for generalization when attacking models of various sizes. We introduce a novel scalable jailbreak attack that preempts the activation of an LLM's safety policies by occupying its computational resources. Our method involves engaging the LLM in a resource-intensive preliminary task - a Character Map lookup and decoding process - before presenting the target instruction. By saturating the model's processing capacity, we prevent the activation of safety protocols when processing the subsequent instruction. Extensive experiments on state-of-the-art LLMs demonstrate that our method achieves a high success rate in bypassing safety measures without requiring gradient access or manual prompt engineering. We verify that our approach offers a scalable attack that quantifies attack strength and adapts to different model scales at the optimal strength. We show that the safety policies of LLMs may be more susceptible to resource constraints. Our findings reveal a critical vulnerability in current LLM safety designs, highlighting the need for more robust defense strategies that account for resource-intensive conditions.
Submitted 5 October, 2024; originally announced October 2024.
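The preliminary task itself is an ordinary substitution-cipher puzzle, sketched below purely mechanically with a benign payload. Map size and string length act as the attack-strength dial the abstract mentions; how the paper composes and tunes the full prompt is not shown here.

```python
# Sketch of a character-map lookup-and-decode task (benign payload).
import random
import string

def make_char_map(seed=0):
    rng = random.Random(seed)
    letters = list(string.ascii_lowercase)
    shuffled = letters[:]
    rng.shuffle(shuffled)
    return dict(zip(letters, shuffled))

def encode(text, cmap):
    return "".join(cmap.get(ch, ch) for ch in text.lower())

cmap = make_char_map()
task = (f"Use this character map {cmap} to decode the following string, "
        f"then follow the decoded text: {encode('say hello', cmap)}")
print(task)
```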
arXiv:2410.02675 [pdf, other] (https://arxiv.org/abs/2410.02675)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: FAN: Fourier Analysis Networks
Authors: Yihong Dong, Ge Li, Yongding Tao, Xue Jiang, Kechi Zhang, Jia Li, Jing Su, Jun Zhang, Jingjing Xu
Abstract: Despite the remarkable success achieved by neural networks, particularly those represented by MLP and Transformer, we reveal that they exhibit potential flaws in the modeling and reasoning of periodicity, i.e., they tend to memorize the periodic data rather than genuinely understanding the underlying principles of periodicity. However, periodicity is a crucial trait in various forms of reasoning and generalization, underpinning predictability across natural and engineered systems through recurring patterns in observations. In this paper, we propose FAN, a novel network architecture based on Fourier Analysis, which empowers the ability to efficiently model and reason about periodic phenomena. By introducing Fourier Series, the periodicity is naturally integrated into the structure and computational processes of the neural network, thus achieving a more accurate expression and prediction of periodic patterns. As a promising substitute for the multi-layer perceptron (MLP), FAN can seamlessly replace MLP in various models with fewer parameters and FLOPs. Through extensive experiments, we demonstrate the effectiveness of FAN in modeling and reasoning about periodic functions, and the superiority and generalizability of FAN across a range of real-world tasks, including symbolic formula representation, time series forecasting, and language modeling.
Submitted 9 November, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
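A layer in this spirit can be sketched by spending part of the output width on cos/sin projections, so periodic structure is expressed directly, and the rest on a standard nonlinear projection. The width split and activation below are assumptions, not necessarily the paper's exact layer.

```python
# Hedged sketch of a Fourier-style drop-in replacement for an MLP layer.
import torch
import torch.nn as nn

class FourierLayer(nn.Module):
    def __init__(self, d_in, d_out, periodic_ratio=0.25):
        super().__init__()
        d_p = int(d_out * periodic_ratio)     # cos and sin each get d_p dims
        self.w_p = nn.Linear(d_in, d_p, bias=False)
        self.w_g = nn.Linear(d_in, d_out - 2 * d_p)
        self.act = nn.GELU()

    def forward(self, x):
        p = self.w_p(x)
        return torch.cat([torch.cos(p), torch.sin(p),
                          self.act(self.w_g(x))], dim=-1)

layer = FourierLayer(64, 64)
print(layer(torch.randn(2, 64)).shape)   # torch.Size([2, 64])
```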
arXiv:2410.02298 [pdf, other] (https://arxiv.org/abs/2410.02298)
Subjects: cs.CR (Cryptography and Security); cs.CL (Computation and Language)
Title: Jailbreak Antidote: Runtime Safety-Utility Balance via Sparse Representation Adjustment in Large Language Models
Authors: Guobin Shen, Dongcheng Zhao, Yiting Dong, Xiang He, Yi Zeng
Abstract: As large language models (LLMs) become integral to various applications, ensuring both their safety and utility is paramount. Jailbreak attacks, which manipulate LLMs into generating harmful content, pose significant challenges to this balance. Existing defenses, such as prompt engineering and safety fine-tuning, often introduce computational overhead, increase inference latency, and lack runtime flexibility. Moreover, overly restrictive safety measures can degrade model utility by causing refusals of benign queries. In this paper, we introduce Jailbreak Antidote, a method that enables real-time adjustment of LLM safety preferences by manipulating a sparse subset of the model's internal states during inference. By shifting the model's hidden representations along a safety direction with varying strengths, we achieve flexible control over the safety-utility balance without additional token overhead or inference delays. Our analysis reveals that safety-related information in LLMs is sparsely distributed; adjusting approximately 5% of the internal state is as effective as modifying the entire state. Extensive experiments on nine LLMs (ranging from 2 billion to 72 billion parameters), evaluated against ten jailbreak attack methods and compared with six defense strategies, validate the effectiveness and efficiency of our approach. By directly manipulating internal states during reasoning, Jailbreak Antidote offers a lightweight, scalable solution that enhances LLM safety while preserving utility, opening new possibilities for real-time safety mechanisms in widely-deployed AI systems.
Submitted 7 October, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
Comments: 10 pages, 5 figures
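The sparse adjustment the abstract describes can be sketched as shifting a hidden state along a precomputed safety direction, restricted to the top few percent of coordinates by magnitude and scaled by a runtime strength knob. How the direction is extracted, and the 4096-dim size, are assumptions here.

```python
# Hedged sketch of sparse hidden-state steering along a safety direction.
import torch

def sparse_safety_shift(hidden: torch.Tensor, direction: torch.Tensor,
                        strength: float = 1.0, fraction: float = 0.05):
    k = max(1, int(fraction * direction.numel()))
    idx = direction.abs().topk(k).indices        # most safety-relevant dims
    mask = torch.zeros_like(direction)
    mask[idx] = 1.0
    return hidden + strength * direction * mask  # shift only ~5% of the state

h = torch.randn(4096)            # one token's hidden state (size illustrative)
d = torch.randn(4096)            # assumed precomputed safety direction
h_safe = sparse_safety_shift(h, d, strength=0.8)
print(h_safe.shape)
```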
arXiv:2410.01860 [pdf, other] (https://arxiv.org/abs/2410.01860)
Subjects: stat.ML (Machine Learning); cs.LG (Machine Learning)
Title: FredNormer: Frequency Domain Normalization for Non-stationary Time Series Forecasting
Authors: Xihao Piao, Zheng Chen, Yushun Dong, Yasuko Matsubara, Yasushi Sakurai
Abstract: Recent normalization-based methods have shown great success in tackling the distribution shift issue, facilitating non-stationary time series forecasting. Since these methods operate in the time domain, they may fail to fully capture the dynamic patterns that are more apparent in the frequency domain, leading to suboptimal results. This paper first theoretically analyzes how normalization methods affect frequency components. We prove that the current normalization methods that operate in the time domain uniformly scale non-zero frequencies, and thus, they struggle to determine components that contribute to more robust forecasting. Therefore, we propose FredNormer, which observes datasets from a frequency perspective and adaptively up-weights the key frequency components. To this end, FredNormer consists of two components: a statistical metric that normalizes the input samples based on their frequency stability and a learnable weighting layer that adjusts stability and introduces sample-specific variations. Notably, FredNormer is a plug-and-play module, which does not compromise the efficiency compared to existing normalization methods. Extensive experiments show that FredNormer improves the averaged MSE of backbone forecasting models by 33.3% and 55.3% on the ETTm2 dataset. Compared to the baseline normalization methods, FredNormer achieves 18 top-1 results and 6 top-2 results out of 28 settings.
Submitted 16 October, 2024; v1 submitted 2 October, 2024; originally announced October 2024.
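The two-component design can be sketched in a few lines: a per-frequency stability statistic fitted on training data, plus a learnable scale on top, applied in the frequency domain and inverted back. The mean-over-std stability metric below is an interpretation of the abstract, not the paper's exact formula.

```python
# Hedged sketch of frequency-stability reweighting in the spirit of FredNormer.
import torch
import torch.nn as nn

class FreqStabilityWeight(nn.Module):
    def __init__(self, train_batch: torch.Tensor, eps: float = 1e-8):
        super().__init__()
        mag = torch.fft.rfft(train_batch, dim=-1).abs()    # (N, n_freq)
        stability = mag.mean(0) / (mag.std(0) + eps)       # stable = high
        self.register_buffer("base", stability / stability.sum())
        self.scale = nn.Parameter(torch.ones_like(self.base))  # learnable

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        spec = torch.fft.rfft(x, dim=-1)
        spec = spec * (self.base * self.scale)             # reweight freqs
        return torch.fft.irfft(spec, n=x.shape[-1], dim=-1)

norm = FreqStabilityWeight(torch.randn(32, 96))            # fit on train data
out = norm(torch.randn(8, 96))                             # plug-and-play use
print(out.shape)                                           # torch.Size([8, 96])
```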
