Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,773 results for author: <span class="mathjax">Zhu, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhu%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhu, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhu%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhu, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhu%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhu%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14029">arXiv:2411.14029</a> <span> [<a href="https://arxiv.org/pdf/2411.14029">pdf</a>, <a href="https://arxiv.org/format/2411.14029">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Relation-aware based Siamese Denoising Autoencoder for Malware Few-shot Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinting Zhu</a>, <a href="/search/cs?searchtype=author&query=Jang-Jaccard%2C+J">Julian Jang-Jaccard</a>, <a href="/search/cs?searchtype=author&query=Welch%2C+I">Ian Welch</a>, <a href="/search/cs?searchtype=author&query=AI-Sahaf%2C+H">Harith AI-Sahaf</a>, <a href="/search/cs?searchtype=author&query=Camtepe%2C+S">Seyit Camtepe</a>, <a href="/search/cs?searchtype=author&query=Dunmore%2C+A">Aeryn Dunmore</a>, <a href="/search/cs?searchtype=author&query=Lab%2C+C">Cybersecurity Lab</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14029v1-abstract-short" style="display: inline;"> When malware employs an unseen zero-day exploit, traditional security measures such as vulnerability scanners and antivirus software can fail to detect them. This is because these tools rely on known patches and signatures, which do not exist for new zero-day attacks. 
Furthermore, existing machine learning methods, which are trained on specific and occasionally outdated malware samples, may strugg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14029v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14029v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14029v1-abstract-full" style="display: none;"> When malware employs an unseen zero-day exploit, traditional security measures such as vulnerability scanners and antivirus software can fail to detect them. This is because these tools rely on known patches and signatures, which do not exist for new zero-day attacks. Furthermore, existing machine learning methods, which are trained on specific and occasionally outdated malware samples, may struggle to adapt to features in new malware. To address this issue, there is a need for a more robust machine learning model that can identify relationships between malware samples without being trained on a particular malware feature set. This is particularly crucial in the field of cybersecurity, where the number of malware samples is limited and obfuscation techniques are widely used. Current approaches using stacked autoencoders aim to remove the noise introduced by obfuscation techniques through reconstruction of the input. However, this approach ignores the semantic relationships between features across different malware samples. To overcome this limitation, we propose a novel Siamese Neural Network (SNN) that uses relation-aware embeddings to calculate more accurate similarity probabilities based on semantic details of different malware samples. In addition, by using entropy images as inputs, our model can extract better structural information and subtle differences in malware signatures, even in the presence of obfuscation techniques. Evaluations on two large malware sample sets using the N-shot and N-way methods show that our proposed model is highly effective in predicting previously unseen malware, even in the presence of obfuscation techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14029v1-abstract-full').style.display = 'none'; document.getElementById('2411.14029v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12235">arXiv:2411.12235</a> <span> [<a href="https://arxiv.org/pdf/2411.12235">pdf</a>, <a href="https://arxiv.org/format/2411.12235">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BoolQuestions: Does Dense Retrieval Understand Boolean Logic in Language? 
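
   The entry above hinges on a Siamese network that turns a pair of embeddings into a similarity probability. A minimal sketch of just that mechanism, assuming PyTorch (the paper's relation-aware embeddings, denoising autoencoder, and entropy-image inputs are not given in this listing and are not reproduced here):

```python
# Minimal Siamese similarity scorer (illustrative only; not the paper's
# architecture). Two inputs share one encoder; the distance between
# their embeddings is mapped to a similarity probability.
import torch
import torch.nn as nn

class SiameseScorer(nn.Module):
    def __init__(self, in_dim=4096, emb_dim=128):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, 512), nn.ReLU(),
            nn.Linear(512, emb_dim),
        )
        self.head = nn.Linear(emb_dim, 1)  # maps |z1 - z2| to a logit

    def forward(self, x1, x2):
        z1, z2 = self.encoder(x1), self.encoder(x2)
        return torch.sigmoid(self.head(torch.abs(z1 - z2))).squeeze(-1)

scorer = SiameseScorer()
a, b = torch.rand(8, 4096), torch.rand(8, 4096)
print(scorer(a, b).shape)  # torch.Size([8]) -- pairwise similarity in [0, 1]
```

   In an N-way, N-shot evaluation, a query sample is then labeled by its highest-probability match among the support samples.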
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zongmeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinhua Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wengang Zhou</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xiang Qi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Houqiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12235v1-abstract-short" style="display: inline;"> Dense retrieval, which aims to encode the semantic information of arbitrary text into dense vector representations or embeddings, has emerged as an effective and efficient paradigm for text retrieval, consequently becoming an essential component in various natural language processing systems. These systems typically focus on optimizing the embedding space by attending to the relevance of text pair… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12235v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12235v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12235v1-abstract-full" style="display: none;"> Dense retrieval, which aims to encode the semantic information of arbitrary text into dense vector representations or embeddings, has emerged as an effective and efficient paradigm for text retrieval, consequently becoming an essential component in various natural language processing systems. These systems typically focus on optimizing the embedding space by attending to the relevance of text pairs, while overlooking the Boolean logic inherent in language, which may not be captured by current training objectives. In this work, we first investigate whether current retrieval systems can comprehend the Boolean logic implied in language. To answer this question, we formulate the task of Boolean Dense Retrieval and collect a benchmark dataset, BoolQuestions, which covers complex queries containing basic Boolean logic and corresponding annotated passages. Through extensive experimental results on the proposed task and benchmark dataset, we draw the conclusion that current dense retrieval systems do not fully understand Boolean logic in language, and there is a long way to go to improve our dense retrieval systems. Furthermore, to promote further research on enhancing the understanding of Boolean logic for language models, we explore Boolean operation on decomposed query and propose a contrastive continual training method that serves as a strong baseline for the research community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12235v1-abstract-full').style.display = 'none'; document.getElementById('2411.12235v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of the Association for Computational Linguistics: EMNLP 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 2767-2779 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10958">arXiv:2411.10958</a> <span> [<a href="https://arxiv.org/pdf/2411.10958">pdf</a>, <a href="https://arxiv.org/format/2411.10958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> SageAttention2 Technical Report: Accurate 4 Bit Attention for Plug-and-play Inference Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jintao Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haofeng Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pengle Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jia Wei</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10958v1-abstract-short" style="display: inline;"> Although quantization for linear layers has been widely used, its application to accelerate the attention process remains limited. SageAttention utilizes 8-bit matrix multiplication, 16-bit matrix multiplication with 16-bit accumulator, and precision-enhancing methods, implementing an accurate and 2x speedup kernel compared to FlashAttention2. To further enhance the efficiency of attention computa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10958v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10958v1-abstract-full" style="display: none;"> Although quantization for linear layers has been widely used, its application to accelerate the attention process remains limited. SageAttention utilizes 8-bit matrix multiplication, 16-bit matrix multiplication with 16-bit accumulator, and precision-enhancing methods, implementing an accurate and 2x speedup kernel compared to FlashAttention2. To further enhance the efficiency of attention computation while maintaining precision, we propose SageAttention2, which utilizes significantly faster 4-bit matrix multiplication (Matmul) alongside additional precision-enhancing techniques. 
First, we propose to quantize matrixes $(Q, K)$ to INT4 in a warp-level granularity and quantize matrixes $(\widetilde P, V)$ to FP8. Second, we propose a method to smooth $Q$ and $V$, enhancing the accuracy of attention with INT4 $QK$ and FP8 $PV$. Third, we analyze the quantization accuracy across timesteps and layers, then propose an adaptive quantization method to ensure the end-to-end metrics over various models. The operations per second (OPS) of SageAttention2 surpass FlashAttention2 and xformers by about 3x and 5x on RTX4090, respectively. Comprehensive experiments confirm that our approach incurs negligible end-to-end metrics loss across diverse models, including those for large language processing, image generation, and video generation. The codes are available at https://github.com/thu-ml/SageAttention. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10958v1-abstract-full').style.display = 'none'; document.getElementById('2411.10958v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10442">arXiv:2411.10442</a> <span> [<a href="https://arxiv.org/pdf/2411.10442">pdf</a>, <a href="https://arxiv.org/format/2411.10442">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiyun Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenhai Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yue Cao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yangzhou Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangwei Gao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinguo Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xizhou Zhu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Lewei Lu</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jifeng Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10442v1-abstract-short" style="display: inline;"> Existing open-source multimodal large language models (MLLMs) generally follow a training process involving pre-training and supervised fine-tuning. However, these models suffer from distribution shifts, which limit their multimodal reasoning, particularly in the Chain-of-Thought (CoT) performance. 
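
   As a rough numerical illustration of the INT4 step described above (the actual kernels are warp-level CUDA; the mean-subtraction "smoothing" below is an assumption, since the abstract does not give the exact transform):

```python
# Rough sketch of symmetric INT4 quantize/dequantize as applied to
# (Q, K) in SageAttention2-style kernels (illustrative; the real work
# happens in CUDA). Smoothing is approximated as mean subtraction,
# which is an assumption -- the abstract only says Q and V are smoothed.
import numpy as np

def quant_int4(x):
    scale = np.abs(x).max() / 7.0  # symmetric: map max magnitude to +/-7
    q = np.clip(np.round(x / scale), -8, 7).astype(np.int8)
    return q, scale

Q = np.random.randn(16, 64).astype(np.float32)
Q_smooth = Q - Q.mean(axis=0, keepdims=True)   # assumed smoothing step
q4, s = quant_int4(Q_smooth)
Q_hat = q4.astype(np.float32) * s              # dequantized approximation
print("max abs quantization error:", np.abs(Q_smooth - Q_hat).max())
```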

4. arXiv:2411.10442 [pdf, other] cs.CL, cs.CV
   Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization
   Authors: Weiyun Wang, Zhe Chen, Wenhai Wang, Yue Cao, Yangzhou Liu, Zhangwei Gao, Jinguo Zhu, Xizhou Zhu, Lewei Lu, Yu Qiao, Jifeng Dai
   Abstract: Existing open-source multimodal large language models (MLLMs) generally follow a training process involving pre-training and supervised fine-tuning. However, these models suffer from distribution shifts, which limit their multimodal reasoning, particularly their Chain-of-Thought (CoT) performance. To address this, we introduce a preference optimization (PO) process to enhance the multimodal reasoning capabilities of MLLMs. Specifically, (1) on the data side, we design an automated preference data construction pipeline to create MMPR, a high-quality, large-scale multimodal reasoning preference dataset; and (2) on the model side, we explore integrating PO with MLLMs, developing a simple yet effective method, termed Mixed Preference Optimization (MPO), which boosts multimodal CoT performance. Our approach demonstrates improved performance across multiple benchmarks, particularly in multimodal reasoning tasks. Notably, our model, InternVL2-8B-MPO, achieves an accuracy of 67.0 on MathVista, outperforming InternVL2-8B by 8.7 points and achieving performance comparable to the 10x larger InternVL2-76B. We hope this study could inspire further advancements in MLLMs. Code, data, and model shall be publicly released.
   Submitted 15 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10346">arXiv:2411.10346</a> <span> [<a href="https://arxiv.org/pdf/2411.10346">pdf</a>, <a href="https://arxiv.org/format/2411.10346">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BiDense: Binarization for Dense Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+R">Rui Yin</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Haotong Qin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yong Guo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianjun Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+B">Biao Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10346v2-abstract-short" style="display: inline;"> Dense prediction is a critical task in computer vision. However, previous methods often require extensive computational resources, which hinders their real-world application. In this paper, we propose BiDense, a generalized binary neural network (BNN) designed for efficient and accurate dense prediction tasks. BiDense incorporates two key techniques: the Distribution-adaptive Binarizer (DAB) and t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10346v2-abstract-full').style.display = 'inline'; document.getElementById('2411.10346v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10346v2-abstract-full" style="display: none;"> Dense prediction is a critical task in computer vision. However, previous methods often require extensive computational resources, which hinders their real-world application. In this paper, we propose BiDense, a generalized binary neural network (BNN) designed for efficient and accurate dense prediction tasks. BiDense incorporates two key techniques: the Distribution-adaptive Binarizer (DAB) and the Channel-adaptive Full-precision Bypass (CFB). The DAB adaptively calculates thresholds and scaling factors for binarization, effectively retaining more information within BNNs. Meanwhile, the CFB facilitates full-precision bypassing for binary convolutional layers undergoing various channel size transformations, which enhances the propagation of real-valued signals and minimizes information loss. By leveraging these techniques, BiDense preserves more real-valued information, enabling more accurate and detailed dense predictions in BNNs. Extensive experiments demonstrate that our framework achieves performance levels comparable to full-precision models while significantly reducing memory usage and computational costs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10346v2-abstract-full').style.display = 'none'; document.getElementById('2411.10346v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10023">arXiv:2411.10023</a> <span> [<a href="https://arxiv.org/pdf/2411.10023">pdf</a>, <a href="https://arxiv.org/format/2411.10023">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Model Inversion Attacks: A Survey of Approaches and Countermeasures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhanke Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianing Zhu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fengfei Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuan Li</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiong Peng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10023v1-abstract-short" style="display: inline;"> The success of deep neural networks has driven numerous research studies and applications from Euclidean to non-Euclidean data. However, there are increasing concerns about privacy leakage, as these networks rely on processing private data. Recently, a new type of privacy attack, the model inversion attacks (MIAs), aims to extract sensitive features of private data for training by abusing access t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10023v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10023v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10023v1-abstract-full" style="display: none;"> The success of deep neural networks has driven numerous research studies and applications from Euclidean to non-Euclidean data. However, there are increasing concerns about privacy leakage, as these networks rely on processing private data. Recently, a new type of privacy attack, the model inversion attacks (MIAs), aims to extract sensitive features of private data for training by abusing access to a well-trained model. The effectiveness of MIAs has been demonstrated in various domains, including images, texts, and graphs. These attacks highlight the vulnerability of neural networks and raise awareness about the risk of privacy leakage within the research community. Despite the significance, there is a lack of systematic studies that provide a comprehensive overview and deeper insights into MIAs across different domains. 

6. arXiv:2411.10023 [pdf, other] cs.LG
   Model Inversion Attacks: A Survey of Approaches and Countermeasures
   Authors: Zhanke Zhou, Jianing Zhu, Fengfei Yu, Xuan Li, Xiong Peng, Tongliang Liu, Bo Han
   Abstract: The success of deep neural networks has driven numerous research studies and applications from Euclidean to non-Euclidean data. However, there are increasing concerns about privacy leakage, as these networks rely on processing private data. Recently, a new type of privacy attack, the model inversion attack (MIA), aims to extract sensitive features of private training data by abusing access to a well-trained model. The effectiveness of MIAs has been demonstrated in various domains, including images, texts, and graphs. These attacks highlight the vulnerability of neural networks and raise awareness about the risk of privacy leakage within the research community. Despite their significance, there is a lack of systematic studies that provide a comprehensive overview and deeper insights into MIAs across different domains. This survey aims to summarize up-to-date MIA methods in both attacks and defenses, highlighting their contributions and limitations, underlying modeling principles, optimization challenges, and future directions. We hope this survey bridges the gap in the literature and facilitates future research in this critical area. Besides, we are maintaining a repository to keep track of relevant research at https://github.com/AndrewZhou924/Awesome-model-inversion-attack.
   Submitted 15 November, 2024; originally announced November 2024.
   Comments: 40 pages, 17 figures

7. arXiv:2411.09866 [pdf, ps, other] cs.IT
   Power Allocation for Compute-and-Forward over Fading Channels
   Authors: Lanwei Zhang, Jamie Evans, Jingge Zhu
   Abstract: Compute-and-forward (CF) is a relaying strategy which allows the relay to decode a linear combination of the transmitted messages. This work studies the optimal power allocation problem for the CF scheme in fast fading channels for maximizing the symmetric computation rate, which is a non-convex optimization problem with no simple analytical or numerical solutions. In the first part of the paper, we investigate the problem when there are finitely many channel states (discrete case). We establish several important properties of the optimal solutions and show that if all users share the same power allocation policy (symmetric policy), the optimal solution takes the form of a water-filling type when the power constraint exceeds a certain threshold. However, if asymmetric policies are allowed, the optimal solution does not take this form for any power constraint. We propose a low-complexity order-based algorithm for both scenarios and compare its performance with baseline algorithms. In the second part of the paper, we state relevant results when the channel coefficients are modelled as continuous random variables (continuous case) and propose a similar low-complexity iterative algorithm for the symmetric policy scenario. Numerical results are provided for both discrete and continuous cases. It is shown that in general our proposed algorithm finds good suboptimal solutions with low complexity, and for some examples considered, finds an exact optimal solution.
   Submitted 14 November, 2024; originally announced November 2024.
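
   For reference, the "water-filling type" structure mentioned for the symmetric policy is the classic allocation in which power is poured above a common water level. A generic sketch (not the paper's order-based algorithm, whose rate expression is specific to compute-and-forward):

```python
# Generic water-filling sketch (illustrative; the CF-specific rate
# expression is not reproduced here). Power p_i = max(mu - 1/g_i, 0)
# is poured up to a common water level mu found by bisection.
import numpy as np

def water_fill(gains, P_total, iters=60):
    lo, hi = 0.0, P_total + 1.0 / gains.min()  # mu = hi already over-allocates
    for _ in range(iters):                     # bisect on the water level mu
        mu = (lo + hi) / 2
        p = np.maximum(mu - 1.0 / gains, 0.0)
        lo, hi = (mu, hi) if p.sum() < P_total else (lo, mu)
    return np.maximum(mu - 1.0 / gains, 0.0)

g = np.array([0.3, 1.0, 2.5])          # channel gains
p = water_fill(g, P_total=4.0)
print(p, p.sum())                      # allocations summing to ~4.0
```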

8. arXiv:2411.09595 [pdf, other] cs.LG, cs.AI, cs.CL, cs.CV
   LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models
   Authors: Zhengyi Wang, Jonathan Lorraine, Yikai Wang, Hang Su, Jun Zhu, Sanja Fidler, Xiaohui Zeng
   Abstract: This work explores expanding the capabilities of large language models (LLMs) pretrained on text to generate 3D meshes within a unified model. This offers key advantages of (1) leveraging spatial knowledge already embedded in LLMs, derived from textual sources like 3D tutorials, and (2) enabling conversational 3D generation and mesh understanding. A primary challenge is effectively tokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly. To address this, we introduce LLaMA-Mesh, a novel approach that represents the vertex coordinates and face definitions of 3D meshes as plain text, allowing direct integration with LLMs without expanding the vocabulary. We construct a supervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate 3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs as required, and (3) understand and interpret 3D meshes. Our work is the first to demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge for 3D mesh generation in a text-based format, effectively unifying the 3D and text modalities. LLaMA-Mesh achieves mesh generation quality on par with models trained from scratch while maintaining strong text generation performance.
   Submitted 14 November, 2024; originally announced November 2024.
   Comments: See the project website at https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/
   MSC Class: 68T05; ACM Class: I.3.5; I.2.10; I.2.6

9. arXiv:2411.09100 [pdf, other] cs.SI, stat.ME
   General linear threshold models with application to influence maximization
   Authors: Alexander Kagan, Elizaveta Levina, Ji Zhu
   Abstract: A number of models have been developed for information spread through networks, often for solving the Influence Maximization (IM) problem. IM is the task of choosing a fixed number of nodes to "seed" with information in order to maximize the spread of this information through the network, with applications in areas such as marketing and public health. Most methods for this problem rely heavily on the assumption of known strength of connections between network members (edge weights), which is often unrealistic. In this paper, we develop a likelihood-based approach to estimate edge weights from the fully and partially observed information diffusion paths. We also introduce a broad class of information diffusion models, the general linear threshold (GLT) model, which generalizes the well-known linear threshold (LT) model by allowing arbitrary distributions of node activation thresholds. We then show our weight estimator is consistent under the GLT and some mild assumptions. For the special case of the standard LT model, we also present a much faster expectation-maximization approach for weight estimation. Finally, we prove that for the GLT models, the IM problem can be solved by a natural greedy algorithm with standard optimality guarantees if all node threshold distributions have concave cumulative distribution functions. Extensive experiments on synthetic and real-world networks demonstrate that the flexibility in the choice of threshold distribution combined with the estimation of edge weights significantly improves the quality of IM solutions, spread prediction, and the estimates of the node activation probabilities.
   Submitted 13 November, 2024; originally announced November 2024.
   Comments: 30 pages, 10 figures
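
   The "natural greedy algorithm with standard optimality guarantees" referenced above is the usual greedy for monotone submodular spread, which achieves a (1 − 1/e) approximation. A generic sketch, with the GLT diffusion simulation stubbed out (an assumption; a real estimator would run Monte Carlo over the diffusion model):

```python
# Generic greedy influence-maximization sketch (illustrative). Under the
# paper's concave-CDF condition the GLT spread is monotone submodular,
# which is what gives greedy its (1 - 1/e) guarantee.
import random

def estimate_spread(seeds, n_nodes, trials=200):
    # Stub: replace with Monte Carlo simulation of the GLT diffusion.
    random.seed(hash(frozenset(seeds)) % (2**32))
    return len(seeds) + sum(random.random() for _ in range(trials)) / trials

def greedy_im(n_nodes, k):
    seeds = set()
    for _ in range(k):  # add the node with the largest marginal gain
        best = max((v for v in range(n_nodes) if v not in seeds),
                   key=lambda v: estimate_spread(seeds | {v}, n_nodes))
        seeds.add(best)
    return seeds

print(greedy_im(n_nodes=20, k=3))
```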

10. arXiv:2411.08606 [pdf, other] cs.CV
    LG-Gaze: Learning Geometry-aware Continuous Prompts for Language-Guided Gaze Estimation
    Authors: Pengwei Yin, Jingjing Wang, Guanzhong Zeng, Di Xie, Jiang Zhu
    Abstract: The ability of gaze estimation models to generalize is often significantly hindered by various factors unrelated to gaze, especially when the training dataset is limited. Current strategies aim to address this challenge through different domain generalization techniques, yet they have had limited success due to the risk of overfitting when solely relying on value labels for regression. Recent progress in pre-trained vision-language models has motivated us to capitalize on the abundant semantic information available. We propose a novel approach in this paper, reframing the gaze estimation task as a vision-language alignment issue. Our proposed framework, named Language-Guided Gaze Estimation (LG-Gaze), learns continuous and geometry-sensitive features for gaze estimation, benefiting from the rich prior knowledge of vision-language models. Specifically, LG-Gaze aligns gaze features with continuous linguistic features through our proposed multimodal contrastive regression loss, which customizes adaptive weights for different negative samples. Furthermore, to better adapt to the labels for the gaze estimation task, we propose a geometry-aware interpolation method to obtain more precise gaze embeddings. Through extensive experiments, we validate the efficacy of our framework in four different cross-domain evaluation tasks.
    Submitted 13 November, 2024; originally announced November 2024.
    Comments: Accepted to ECCV 2024
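
    The "multimodal contrastive regression loss which customizes adaptive weights for different negative samples" is not spelled out in this listing; one plausible form (an assumption, not the paper's formula) weights each negative by how far its label lies from the anchor's:

```python
# One plausible label-aware contrastive regression loss (an assumption;
# LG-Gaze's exact weighting is not given in this listing): negatives
# whose labels are far from the anchor's contribute more strongly.
import torch
import torch.nn.functional as F

def contrastive_regression_loss(img_z, txt_z, labels, tau=0.1):
    img_z = F.normalize(img_z, dim=-1)
    txt_z = F.normalize(txt_z, dim=-1)
    sim = img_z @ txt_z.t() / tau                  # (B, B) similarities
    with torch.no_grad():
        d = torch.cdist(labels, labels)            # pairwise label distance
        w = d / d.max().clamp_min(1e-8)            # adaptive negative weights
    exp_sim = sim.exp()
    pos = exp_sim.diagonal()                       # matched pairs
    neg = (w * exp_sim).sum(dim=1)                 # weighted negatives (diag w=0)
    return -(pos / (pos + neg)).log().mean()

z1, z2 = torch.randn(8, 32), torch.randn(8, 32)
y = torch.randn(8, 2)                              # e.g. gaze yaw/pitch labels
print(contrastive_regression_loss(z1, z2, y))
```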

11. arXiv:2411.08520 [pdf, other] cs.IT
    On the Design of Variable Modulation and Adaptive Modulation for Uplink Sparse Code Multiple Access
    Authors: Qu Luo, Pei Xiao, Gaojie Chen, Jing Zhu
    Abstract: Sparse code multiple access (SCMA) is a promising non-orthogonal multiple access scheme for enabling massive connectivity in next generation wireless networks. However, current SCMA codebooks are designed with the same size, leading to inflexibility of user grouping and supporting diverse data rates. To address this issue, we propose a variable modulation SCMA (VM-SCMA) that allows users to employ codebooks with different modulation orders. To guide the VM-SCMA design, a VM matrix (VMM) that assigns modulation orders based on the SCMA factor graph is first introduced. We formulate the VM-SCMA design using the proposed average inverse product distance and the asymptotic upper bound of sum-rate, and jointly optimize the VMM, VM codebooks, power and codebook allocations. The proposed VM-SCMA not only enables diverse data rates but also supports different modulation order combinations for each rate. Leveraging these distinct advantages, we further propose an adaptive VM-SCMA (AVM-SCMA) scheme which adaptively selects the rate and the corresponding VM codebooks to adapt to the users' channel conditions by maximizing the proposed effective throughput. Simulation results show that the overall designs are able to simultaneously achieve a high level of system flexibility, enhanced error rate results, and significantly improved throughput performance when compared to conventional SCMA schemes.
    Submitted 13 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08451">arXiv:2411.08451</a> <span> [<a href="https://arxiv.org/pdf/2411.08451">pdf</a>, <a href="https://arxiv.org/format/2411.08451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AD-DINO: Attention-Dynamic DINO for Distance-Aware Embodied Reference Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hao Guo</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wei Fan</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+B">Baichun Wei</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianfei Zhu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jin Tian</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+C">Chunzhi Yi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+F">Feng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08451v1-abstract-short" style="display: inline;"> Embodied reference understanding is crucial for intelligent agents to predict referents based on human intention through gesture signals and language descriptions. This paper introduces the Attention-Dynamic DINO, a novel framework designed to mitigate misinterpretations of pointing gestures across various interaction contexts. Our approach integrates visual and textual features to simultaneously… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08451v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08451v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08451v1-abstract-full" style="display: none;"> Embodied reference understanding is crucial for intelligent agents to predict referents based on human intention through gesture signals and language descriptions. This paper introduces the Attention-Dynamic DINO, a novel framework designed to mitigate misinterpretations of pointing gestures across various interaction contexts. Our approach integrates visual and textual features to simultaneously predict the target object's bounding box and the attention source in pointing gestures. Leveraging the distance-aware nature of nonverbal communication in visual perspective taking, we extend the virtual touch line mechanism and propose an attention-dynamic touch line to represent referring gesture based on interactive distances. The combination of this distance-aware approach and independent prediction of the attention source, enhances the alignment between objects and the gesture represented line. Extensive experiments on the YouRefIt dataset demonstrate the efficacy of our gesture information understanding method in significantly improving task performance. Our model achieves 76.4% accuracy at the 0.25 IoU threshold and, notably, surpasses human performance at the 0.75 IoU threshold, marking a first in this domain. Comparative experiments with distance-unaware understanding methods from previous research further validate the superiority of the Attention-Dynamic Touch Line across diverse contexts. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08451v1-abstract-full').style.display = 'none'; document.getElementById('2411.08451v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07037">arXiv:2411.07037</a> <span> [<a href="https://arxiv.org/pdf/2411.07037">pdf</a>, <a href="https://arxiv.org/format/2411.07037">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LIFBench: Evaluating the Instruction Following Performance and Stability of Large Language Models in Long-Context Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaodong Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Minhao Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yichen Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaoming Shi</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">He Yan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xiangju Lu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Junmin Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07037v1-abstract-short" style="display: inline;"> As Large Language Models (LLMs) continue to advance in natural language processing (NLP), their ability to stably follow instructions in long-context inputs has become crucial for real-world applications. While existing benchmarks assess various LLM capabilities, they rarely focus on instruction-following in long-context scenarios or stability on different inputs. In response, we introduce the Lon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07037v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07037v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07037v1-abstract-full" style="display: none;"> As Large Language Models (LLMs) continue to advance in natural language processing (NLP), their ability to stably follow instructions in long-context inputs has become crucial for real-world applications. While existing benchmarks assess various LLM capabilities, they rarely focus on instruction-following in long-context scenarios or stability on different inputs. In response, we introduce the Long-context Instruction-Following Benchmark (LIFBench), a scalable dataset designed to evaluate LLMs' instruction-following capabilities and stability across long contexts. LIFBench comprises three long-context scenarios and eleven diverse tasks, supported by 2,766 instructions generated through an automated expansion method across three dimensions: length, expression, and variables. 
For evaluation, we propose LIFEval, a rubric-based assessment framework that provides precise, automated scoring of complex LLM responses without relying on LLM-assisted evaluations or human judgments. This approach facilitates a comprehensive analysis of model performance and stability across various perspectives. We conduct extensive experiments on 20 notable LLMs across six length intervals, analyzing their instruction-following capabilities and stability. Our work contributes LIFBench and LIFEval as robust tools for assessing LLM performance in complex, long-context settings, providing insights that can inform future LLM development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07037v1-abstract-full').style.display = 'none'; document.getElementById('2411.07037v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05875">arXiv:2411.05875</a> <span> [<a href="https://arxiv.org/pdf/2411.05875">pdf</a>, <a href="https://arxiv.org/format/2411.05875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards Improved Preference Optimization Pipeline: from Data Generation to Budget-Controlled Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuotong Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jennifer Zhu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+W">Wanyu Du</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yanjun Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05875v1-abstract-short" style="display: inline;"> Direct Preference Optimization (DPO) and its variants have become the de facto standards for aligning large language models (LLMs) with human preferences or specific goals. However, DPO requires high-quality preference data and suffers from unstable preference optimization. 
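As a rough picture of what rubric-based, LLM-free scoring can look like, consider the toy scorer below; the rubric items here are invented, whereas LIFBench's actual rubrics are task-specific.

```python
# A toy rubric-based scorer in the spirit of LIFEval: deterministic checks
# over a model response, with no LLM judge involved.
import re

def score_response(response: str, expected_id: str, max_words: int) -> float:
    rubric = [
        expected_id in response,                  # recalled the right item
        len(response.split()) <= max_words,       # respected the length limit
        bool(re.match(r"^Answer:", response)),    # followed the output format
    ]
    return sum(rubric) / len(rubric)              # fraction of checks passed

print(score_response("Answer: doc-42 is the match.", "doc-42", 10))  # 1.0
```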
arXiv:2411.05875 [pdf, other] cs.LG, cs.AI, cs.CL

Towards Improved Preference Optimization Pipeline: from Data Generation to Budget-Controlled Regularization

Authors: Zhuotong Chen, Fang Liu, Jennifer Zhu, Wanyu Du, Yanjun Qi

Abstract: Direct Preference Optimization (DPO) and its variants have become the de facto standards for aligning large language models (LLMs) with human preferences or specific goals. However, DPO requires high-quality preference data and suffers from unstable preference optimization. In this work, we aim to improve the preference optimization pipeline by taking a closer look at preference data generation and training regularization techniques. For preference data generation, we demonstrate that existing scoring-based reward models produce unsatisfactory preference data and perform poorly on out-of-distribution tasks. This significantly impacts the LLM alignment performance when using these data for preference tuning. To ensure high-quality preference data generation, we propose an iterative pairwise ranking mechanism that derives preference ranking of completions using pairwise comparison signals. For training regularization, we observe that preference optimization tends to achieve better convergence when the LLM predicted likelihood of preferred samples gets slightly reduced. However, the widely used supervised next-word prediction regularization strictly prevents any likelihood reduction of preferred samples. This observation motivates our design of a budget-controlled regularization formulation. Empirically, we show that combining the two designs leads to aligned models that surpass existing SOTA across two popular benchmarks.

Submitted 7 November, 2024; originally announced November 2024.

Comments: 15 pages
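A hedged sketch of how such a budget might enter the objective: a standard DPO term plus a hinge that penalizes likelihood reduction of the preferred sample only beyond a budget `tau`. The exact formulation in the paper may differ.

```python
# Budget-controlled regularization sketch: unlike NLL regularization, the
# preferred sample's log-likelihood is allowed to drop, but only by tau.
import torch
import torch.nn.functional as F

def budget_dpo_loss(pi_c, pi_r, ref_c, ref_r, beta=0.1, tau=0.5, lam=1.0):
    # DPO preference term on reference-adjusted log-likelihood margins
    margin = beta * ((pi_c - ref_c) - (pi_r - ref_r))
    pref = -F.logsigmoid(margin)
    # hinge: penalize the preferred likelihood drop only beyond the budget
    reg = F.relu((ref_c - pi_c) - tau)
    return (pref + lam * reg).mean()

loss = budget_dpo_loss(torch.tensor([-12.0]), torch.tensor([-15.0]),
                       torch.tensor([-11.8]), torch.tensor([-14.9]))
print(loss)
```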
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05007">arXiv:2411.05007</a> <span> [<a href="https://arxiv.org/pdf/2411.05007">pdf</a>, <a href="https://arxiv.org/format/2411.05007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SVDQuant: Absorbing Outliers by Low-Rank Components for 4-Bit Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Muyang Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yujun Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhekai Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+T">Tianle Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiuyu Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Junxian Guo</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+E">Enze Xie</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+C">Chenlin Meng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Song Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05007v2-abstract-short" style="display: inline;"> Diffusion models have been proven highly effective at generating high-quality images. However, as these models grow larger, they require significantly more memory and suffer from higher latency, posing substantial challenges for deployment. In this work, we aim to accelerate diffusion models by quantizing their weights and activations to 4 bits. At such an aggressive level, both weights and activa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05007v2-abstract-full').style.display = 'inline'; document.getElementById('2411.05007v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05007v2-abstract-full" style="display: none;"> Diffusion models have been proven highly effective at generating high-quality images. However, as these models grow larger, they require significantly more memory and suffer from higher latency, posing substantial challenges for deployment. In this work, we aim to accelerate diffusion models by quantizing their weights and activations to 4 bits. At such an aggressive level, both weights and activations are highly sensitive, where conventional post-training quantization methods for large language models like smoothing become insufficient. To overcome this limitation, we propose SVDQuant, a new 4-bit quantization paradigm. Different from smoothing which redistributes outliers between weights and activations, our approach absorbs these outliers using a low-rank branch. 
We first consolidate the outliers by shifting them from activations to weights, then employ a high-precision low-rank branch to take in the weight outliers with Singular Value Decomposition (SVD). This process eases the quantization on both sides. However, na茂vely running the low-rank branch independently incurs significant overhead due to extra data movement of activations, negating the quantization speedup. To address this, we co-design an inference engine Nunchaku that fuses the kernels of the low-rank branch into those of the low-bit branch to cut off redundant memory access. It can also seamlessly support off-the-shelf low-rank adapters (LoRAs) without the need for re-quantization. Extensive experiments on SDXL, PixArt-$危$, and FLUX.1 validate the effectiveness of SVDQuant in preserving image quality. We reduce the memory usage for the 12B FLUX.1 models by 3.5$\times$, achieving 3.0$\times$ speedup over the 4-bit weight-only quantized baseline on the 16GB laptop 4090 GPU, paving the way for more interactive applications on PCs. Our quantization library and inference engine are open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05007v2-abstract-full').style.display = 'none'; document.getElementById('2411.05007v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Quantization Library: https://github.com/mit-han-lab/deepcompressor Inference Engine: https://github.com/mit-han-lab/nunchaku Website: https://hanlab.mit.edu/projects/svdquant Demo: https://svdquant.mit.edu Blog: https://hanlab.mit.edu/blog/svdquant</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04138">arXiv:2411.04138</a> <span> [<a href="https://arxiv.org/pdf/2411.04138">pdf</a>, <a href="https://arxiv.org/format/2411.04138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NetworkGym: Reinforcement Learning Environments for Multi-Access Traffic Management in Network Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haider%2C+M">Momin Haider</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Ming Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Menglei Zhang</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Arpit Gupta</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jing Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu-Xiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark 
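The decomposition itself is easy to picture in a few lines of NumPy: migrate outliers into the weight with a smoothing scale, peel off a rank-r high-precision branch via SVD, and quantize the residual at 4 bits. The scale rule and rank below are illustrative, not the paper's tuned settings.

```python
# NumPy sketch of the SVDQuant decomposition idea.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(64, 128)); X[:, 0] *= 50        # activation outlier channel
W = rng.normal(size=(128, 256))

s = np.abs(X).max(axis=0) ** 0.5                     # per-channel smoothing scale
Xs, Ws = X / s, W * s[:, None]                       # shift outliers into weight

U, S, Vt = np.linalg.svd(Ws, full_matrices=False)
r = 16
L = (U[:, :r] * S[:r]) @ Vt[:r]                      # high-precision low-rank branch
R = Ws - L                                           # residual to quantize

step = np.abs(R).max() / 7                           # symmetric 4-bit grid [-7, 7]
Rq = np.clip(np.round(R / step), -7, 7) * step

err = np.linalg.norm(Xs @ (L + Rq) - X @ W) / np.linalg.norm(X @ W)
print(f"relative output error: {err:.4f}")
```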
mathjax" id="2411.04138v1-abstract-short" style="display: inline;"> Mobile devices such as smartphones, laptops, and tablets can often connect to multiple access networks (e.g., Wi-Fi, LTE, and 5G) simultaneously. Recent advancements facilitate seamless integration of these connections below the transport layer, enhancing the experience for apps that lack inherent multi-path support. This optimization hinges on dynamically determining the traffic distribution acro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04138v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04138v1-abstract-full" style="display: none;"> Mobile devices such as smartphones, laptops, and tablets can often connect to multiple access networks (e.g., Wi-Fi, LTE, and 5G) simultaneously. Recent advancements facilitate seamless integration of these connections below the transport layer, enhancing the experience for apps that lack inherent multi-path support. This optimization hinges on dynamically determining the traffic distribution across networks for each device, a process referred to as \textit{multi-access traffic splitting}. This paper introduces \textit{NetworkGym}, a high-fidelity network environment simulator that facilitates generating multiple network traffic flows and multi-access traffic splitting. This simulator facilitates training and evaluating different RL-based solutions for the multi-access traffic splitting problem. Our initial explorations demonstrate that the majority of existing state-of-the-art offline RL algorithms (e.g. CQL) fail to outperform certain hand-crafted heuristic policies on average. This illustrates the urgent need to evaluate offline RL algorithms against a broader range of benchmarks, rather than relying solely on popular ones such as D4RL. We also propose an extension to the TD3+BC algorithm, named Pessimistic TD3 (PTD3), and demonstrate that it outperforms many state-of-the-art offline RL algorithms. PTD3's behavioral constraint mechanism, which relies on value-function pessimism, is theoretically motivated and relatively simple to implement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04138v1-abstract-full').style.display = 'none'; document.getElementById('2411.04138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS (Datasets and Benchmarks)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03814">arXiv:2411.03814</a> <span> [<a href="https://arxiv.org/pdf/2411.03814">pdf</a>, <a href="https://arxiv.org/format/2411.03814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> MRJ-Agent: An Effective Jailbreak Agent for Multi-Round Dialogue </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fengxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+R">Ranjie Duan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">Peng Xiao</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaojun Jia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">YueFeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chongwen Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jialing Tao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+H">Hui Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03814v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) demonstrate outstanding performance in their reservoir of knowledge and understanding capabilities, but they have also been shown to be prone to illegal or unethical reactions when subjected to jailbreak attacks. To ensure their responsible deployment in critical applications, it is crucial to understand the safety capabilities and vulnerabilities of LLMs. Previous wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03814v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03814v1-abstract-full" style="display: none;"> Large Language Models (LLMs) demonstrate outstanding performance in their reservoir of knowledge and understanding capabilities, but they have also been shown to be prone to illegal or unethical reactions when subjected to jailbreak attacks. To ensure their responsible deployment in critical applications, it is crucial to understand the safety capabilities and vulnerabilities of LLMs. Previous works mainly focus on jailbreak in single-round dialogue, overlooking the potential jailbreak risks in multi-round dialogues, which are a vital way humans interact with and extract information from LLMs. Some studies have increasingly concentrated on the risks associated with jailbreak in multi-round dialogues. These efforts typically involve the use of manually crafted templates or prompt engineering techniques. 
However, due to the inherent complexity of multi-round dialogues, their jailbreak performance is limited. To solve this problem, we propose a novel multi-round dialogue jailbreaking agent, emphasizing the importance of stealthiness in identifying and mitigating potential threats to human values posed by LLMs. We propose a risk decomposition strategy that distributes risks across multiple rounds of queries and utilizes psychological strategies to enhance attack strength. Extensive experiments show that our proposed method surpasses other attack methods and achieves state-of-the-art attack success rate. We will make the corresponding code and dataset available for future research. The code will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03814v1-abstract-full').style.display = 'none'; document.getElementById('2411.03814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03359">arXiv:2411.03359</a> <span> [<a href="https://arxiv.org/pdf/2411.03359">pdf</a>, <a href="https://arxiv.org/format/2411.03359">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Self-Calibrated Tuning of Vision-Language Models for Out-of-Distribution Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+G">Geng Yu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianing Zhu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jiangchao Yao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03359v1-abstract-short" style="display: inline;"> Out-of-distribution (OOD) detection is crucial for deploying reliable machine learning models in open-world applications. Recent advances in CLIP-based OOD detection have shown promising results via regularizing prompt tuning with OOD features extracted from ID data. However, the irrelevant context mined from ID data can be spurious due to the inaccurate foreground-background decomposition, thus l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03359v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03359v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03359v1-abstract-full" style="display: none;"> Out-of-distribution (OOD) detection is crucial for deploying reliable machine learning models in open-world applications. 
Recent advances in CLIP-based OOD detection have shown promising results via regularizing prompt tuning with OOD features extracted from ID data. However, the irrelevant context mined from ID data can be spurious due to the inaccurate foreground-background decomposition, thus limiting the OOD detection performance. In this work, we propose a novel framework, namely, Self-Calibrated Tuning (SCT), to mitigate this problem for effective OOD detection with only the given few-shot ID data. Specifically, SCT introduces modulating factors respectively on the two components of the original learning objective. It adaptively directs the optimization process between the two tasks during training on data with different prediction uncertainty to calibrate the influence of OOD regularization, which is compatible with many prompt tuning based OOD detection methods. Extensive experiments and analyses have been conducted to characterize and demonstrate the effectiveness of the proposed SCT. The code is publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03359v1-abstract-full').style.display = 'none'; document.getElementById('2411.03359v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03042">arXiv:2411.03042</a> <span> [<a href="https://arxiv.org/pdf/2411.03042">pdf</a>, <a href="https://arxiv.org/format/2411.03042">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Predictor-Corrector Enhanced Transformers with Exponential Moving Average Coefficient Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+B">Bei Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiahao Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qingyan Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Junliang Guo</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingang Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xunliang Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03042v1-abstract-short" style="display: inline;"> Residual networks, as discrete approximations of Ordinary Differential Equations (ODEs), have inspired significant advancements in neural network design, including multistep methods, high-order methods, and multi-particle dynamical systems. 
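A toy rendering of the modulating-factor idea, with invented weighting functions: confident samples emphasize the ID classification term, uncertain ones the OOD regularizer.

```python
# Uncertainty-modulated two-term loss; the entropy-based weights are an
# illustrative assumption, not SCT's actual factors.
import torch

def sct_style_loss(ce_loss, ood_reg, probs):
    u = -(probs * probs.clamp_min(1e-8).log()).sum(-1)       # predictive entropy
    u = u / torch.log(torch.tensor(float(probs.shape[-1])))  # normalize to [0, 1]
    return ((1 - u) * ce_loss + u * ood_reg).mean()

probs = torch.tensor([[0.9, 0.05, 0.05], [0.34, 0.33, 0.33]])
print(sct_style_loss(torch.tensor([0.2, 0.2]), torch.tensor([1.0, 1.0]), probs))
```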
arXiv:2411.03042 [pdf, other] cs.CL

Predictor-Corrector Enhanced Transformers with Exponential Moving Average Coefficient Learning

Authors: Bei Li, Tong Zheng, Rui Wang, Jiahao Liu, Qingyan Guo, Junliang Guo, Xu Tan, Tong Xiao, Jingbo Zhu, Jingang Wang, Xunliang Cai

Abstract: Residual networks, as discrete approximations of Ordinary Differential Equations (ODEs), have inspired significant advancements in neural network design, including multistep methods, high-order methods, and multi-particle dynamical systems. The precision of the solution to ODEs significantly affects parameter optimization, thereby impacting model performance. In this work, we present a series of advanced explorations of Transformer architecture design to minimize the error compared to the true "solution." First, we introduce a predictor-corrector learning framework to minimize truncation errors, which consists of a high-order predictor and a multistep corrector. Second, we propose an exponential moving average-based coefficient learning method to strengthen our higher-order predictor. Extensive experiments on large-scale machine translation, abstractive summarization, language modeling, and natural language understanding benchmarks demonstrate the superiority of our approach. On the WMT'14 English-German and English-French tasks, our model achieved BLEU scores of 30.95 and 44.27, respectively. Furthermore, on the OPUS multilingual machine translation task, our model surpasses a robust 3.8B DeepNet by an average of 2.9 SacreBLEU, using only 1/3 of its parameters. Notably, it also beats LLama models by 5.7 accuracy points on the LM Harness Evaluation.

Submitted 5 November, 2024; originally announced November 2024.

Comments: Accepted by NeurIPS 2024
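The ODE-solver analogy can be sketched with a Heun-style step that refines an Euler predictor with an averaged corrector and an EMA-maintained mixing coefficient; this is a schematic of the idea, not the paper's Transformer instantiation.

```python
# Predictor-corrector residual update (Heun-style) with an EMA-learned
# mixing coefficient; f stands in for a Transformer sub-layer.
import torch

def predictor_corrector_step(x, f, gamma=0.5):
    x_pred = x + f(x)                                   # explicit Euler predictor
    return x + (1 - gamma) * f(x) + gamma * f(x_pred)   # averaged corrector

def ema_update(coef, target, decay=0.99):
    return decay * coef + (1 - decay) * target          # EMA coefficient learning

f = lambda x: -0.5 * x                  # stand-in dynamics dx/dt = -0.5 x
x = torch.ones(4)
print(predictor_corrector_step(x, f))   # 0.625, closer to exp(-0.5) than Euler's 0.5
```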
arXiv:2411.02448 [pdf, other] cs.CL, cs.AI

Rate, Explain and Cite (REC): Enhanced Explanation and Attribution in Automatic Evaluation by Large Language Models

Authors: Aliyah R. Hsu, James Zhu, Zhichao Wang, Bin Bi, Shubham Mehrotra, Shiva K. Pentyala, Katherine Tan, Xiang-Bo Mao, Roshanak Omrani, Sougata Chaudhuri, Regunathan Radhakrishnan, Sitaram Asur, Claire Na Cheng, Bin Yu

Abstract: LLMs have demonstrated impressive proficiency in generating coherent and high-quality text, making them valuable across a range of text-generation tasks. However, rigorous evaluation of this generated content is crucial, as ensuring its quality remains a significant challenge due to persistent issues such as factual inaccuracies and hallucinations. This paper introduces two fine-tuned general-purpose LLM autoevaluators, REC-12B and REC-70B, specifically designed to evaluate generated text across several dimensions: faithfulness, instruction following, coherence, and completeness. These models not only provide ratings for these metrics but also offer detailed explanations and verifiable citations, thereby enhancing trust in the content. Moreover, the models support various citation modes, accommodating different requirements for latency and granularity. Extensive evaluations on diverse benchmarks demonstrate that our general-purpose LLM auto-evaluator, REC-70B, outperforms state-of-the-art LLMs, excelling in content evaluation by delivering better-quality explanations and citations with minimal bias. It achieves Rank #1 as a generative model on the RewardBench leaderboard (https://huggingface.co/spaces/allenai/reward-bench) under the model name TextEval-Llama3.1-70B. Our REC dataset and models are released at https://github.com/adelaidehsu/REC.

Submitted 2 November, 2024; originally announced November 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02448v1-abstract-full').style.display = 'none'; document.getElementById('2411.02448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span> [<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. 
We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practice of Hunyuan-Large include large-scale synthetic data that is orders larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we also investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidances for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
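For readers unfamiliar with the MoE building block, a generic top-k routing layer looks like the sketch below; Hunyuan-Large's mixed expert routing strategy, KV-cache compression, and expert-specific learning rates are not reproduced here.

```python
# Generic top-k mixture-of-experts routing: each token is dispatched to its
# k highest-scoring experts and the outputs are combined by router weights.
import torch

def topk_moe(x, experts, router, k=2):
    logits = router(x)                                  # [batch, n_experts]
    weights, idx = torch.topk(logits.softmax(-1), k)    # route to k experts
    out = torch.zeros_like(x)
    for j in range(k):                                  # dispatch and combine
        for e in range(len(experts)):
            mask = idx[:, j] == e
            if mask.any():
                out[mask] += weights[mask, j, None] * experts[e](x[mask])
    return out

experts = [torch.nn.Linear(8, 8) for _ in range(4)]
router = torch.nn.Linear(8, 4)
print(topk_moe(torch.randn(3, 8), experts, router).shape)  # torch.Size([3, 8])
```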
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01850">arXiv:2411.01850</a> <span> [<a href="https://arxiv.org/pdf/2411.01850">pdf</a>, <a href="https://arxiv.org/format/2411.01850">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ManiBox: Enhancing Spatial Grasping Generalization via Scalable Simulation Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hengkai Tan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuezhou Xu</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+C">Chengyang Ying</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xinyi Mao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songming Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01850v1-abstract-short" style="display: inline;"> Learning a precise robotic grasping policy is crucial for embodied agents operating in complex real-world manipulation tasks. Despite significant advancements, most models still struggle with accurate spatial positioning of objects to be grasped. We first show that this spatial generalization challenge stems primarily from the extensive data requirements for adequate spatial understanding. However… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01850v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01850v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01850v1-abstract-full" style="display: none;"> Learning a precise robotic grasping policy is crucial for embodied agents operating in complex real-world manipulation tasks. Despite significant advancements, most models still struggle with accurate spatial positioning of objects to be grasped. We first show that this spatial generalization challenge stems primarily from the extensive data requirements for adequate spatial understanding. However, collecting such data with real robots is prohibitively expensive, and relying on simulation data often leads to visual generalization gaps upon deployment. To overcome these challenges, we then focus on state-based policy generalization and present \textbf{ManiBox}, a novel bounding-box-guided manipulation method built on a simulation-based teacher-student framework. The teacher policy efficiently generates scalable simulation data using bounding boxes, which are proven to uniquely determine the objects' spatial positions. The student policy then utilizes these low-dimensional spatial states to enable zero-shot transfer to real robots. 
Through comprehensive evaluations in simulated and real-world environments, ManiBox demonstrates a marked improvement in spatial grasping generalization and adaptability to diverse objects and backgrounds. Further, our empirical study into scaling laws for policy performance indicates that spatial volume generalization scales positively with data volume. For a certain level of spatial volume, the success rate of grasping empirically follows Michaelis-Menten kinetics relative to data volume, showing a saturation effect as data increases. Our videos and code are available in https://thkkk.github.io/manibox. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01850v1-abstract-full').style.display = 'none'; document.getElementById('2411.01850v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00214">arXiv:2411.00214</a> <span> [<a href="https://arxiv.org/pdf/2411.00214">pdf</a>, <a href="https://arxiv.org/format/2411.00214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Inclusive KL Minimization: A Wasserstein-Fisher-Rao Gradient Flow Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jia-Jie Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00214v1-abstract-short" style="display: inline;"> Otto's (2001) Wasserstein gradient flow of the exclusive KL divergence functional provides a powerful and mathematically principled perspective for analyzing learning and inference algorithms. In contrast, algorithms for the inclusive KL inference, i.e., minimizing $ \mathrm{KL}(蟺\| 渭) $ with respect to $ 渭$ for some target $ 蟺$, are rarely analyzed using tools from mathematical analysis. This pap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00214v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00214v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00214v1-abstract-full" style="display: none;"> Otto's (2001) Wasserstein gradient flow of the exclusive KL divergence functional provides a powerful and mathematically principled perspective for analyzing learning and inference algorithms. In contrast, algorithms for the inclusive KL inference, i.e., minimizing $ \mathrm{KL}(蟺\| 渭) $ with respect to $ 渭$ for some target $ 蟺$, are rarely analyzed using tools from mathematical analysis. This paper shows that a general-purpose approximate inclusive KL inference paradigm can be constructed using the theory of gradient flows derived from PDE analysis. 
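The Michaelis-Menten claim is easy to state concretely: success saturates as $s(d) = s_{\max} d / (K + d)$ in the data volume $d$. A quick fit on made-up numbers:

```python
# Fit a Michaelis-Menten curve to hypothetical (data volume, success) points.
import numpy as np
from scipy.optimize import curve_fit

def mm(d, s_max, K):
    return s_max * d / (K + d)

data = np.array([100, 500, 1000, 5000, 20000])      # hypothetical demo counts
succ = np.array([0.22, 0.48, 0.60, 0.78, 0.84])     # hypothetical success rates

(s_max, K), _ = curve_fit(mm, data, succ, p0=(0.9, 1000))
print(f"saturating success ~{s_max:.2f}, half-saturation at ~{K:.0f} demos")
```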
arXiv:2411.00214 [pdf, other] stat.ML, cs.LG, math.OC

Inclusive KL Minimization: A Wasserstein-Fisher-Rao Gradient Flow Perspective

Authors: Jia-Jie Zhu

Abstract: Otto's (2001) Wasserstein gradient flow of the exclusive KL divergence functional provides a powerful and mathematically principled perspective for analyzing learning and inference algorithms. In contrast, algorithms for the inclusive KL inference, i.e., minimizing $\mathrm{KL}(\pi \| \mu)$ with respect to $\mu$ for some target $\pi$, are rarely analyzed using tools from mathematical analysis. This paper shows that a general-purpose approximate inclusive KL inference paradigm can be constructed using the theory of gradient flows derived from PDE analysis. We uncover that several existing learning algorithms can be viewed as particular realizations of the inclusive KL inference paradigm. For example, existing sampling algorithms such as Arbel et al. (2019) and Korba et al. (2021) can be viewed in a unified manner as inclusive-KL inference with approximate gradient estimators. Finally, we provide the theoretical foundation for the Wasserstein-Fisher-Rao gradient flows for minimizing the inclusive KL divergence.

Submitted 31 October, 2024; originally announced November 2024.
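For orientation, one standard way to write such a flow, assuming all measures admit densities (a sketch; the paper's precise construction may differ): for the inclusive-KL energy $F(\mu) = \mathrm{KL}(\pi \| \mu)$, the first variation is $\frac{\delta F}{\delta \mu} = -\pi/\mu$ (up to an additive constant), and the Wasserstein-Fisher-Rao gradient flow combines a transport term with a reaction term:

$$\partial_t \mu_t \;=\; \nabla \cdot \Big( \mu_t \, \nabla \tfrac{\delta F}{\delta \mu} \Big) \;-\; \mu_t \Big( \tfrac{\delta F}{\delta \mu} - \textstyle\int \tfrac{\delta F}{\delta \mu} \, \mathrm{d}\mu_t \Big),$$

where the first term is the Wasserstein (transport) part, the second is the Fisher-Rao (reaction) part, and the mean subtraction keeps $\mu_t$ normalized.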
arXiv:2410.23991 [pdf] cs.CV, cs.AI

Localization, balance and affinity: a stronger multifaceted collaborative salient object detector in remote sensing images

Authors: Yakun Xie, Suning Liu, Hongyu Chen, Shaohan Cao, Huixin Zhang, Dejun Feng, Qian Wan, Jun Zhu, Qing Zhu

Abstract: Despite significant advancements in salient object detection (SOD) in optical remote sensing images (ORSI), challenges persist due to the intricate edge structures of ORSIs and the complexity of their contextual relationships. Current deep learning approaches encounter difficulties in accurately identifying boundary features and lack efficiency in collaboratively modeling the foreground and background by leveraging contextual features. To address these challenges, we propose a stronger multifaceted collaborative salient object detector for ORSIs, termed LBA-MCNet, which incorporates aspects of localization, balance, and affinity. The network focuses on accurately locating targets, balancing detailed features, and modeling image-level global context information. Specifically, we design the Edge Feature Adaptive Balancing and Adjusting (EFABA) module for precise edge localization, using edge features to guide attention to boundaries and preserve spatial details. Moreover, we design the Global Distributed Affinity Learning (GDAL) module to model global context: it captures global patterns by generating an affinity map from the encoder's final layer. Additionally, deep supervision during deconvolution further enhances feature representation. Finally, we compared against 28 state-of-the-art approaches on three publicly available datasets, and the results clearly demonstrate the superiority of our method.

Submitted 31 October, 2024; originally announced October 2024.
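An affinity map of this kind is typically just a normalized Gram matrix over the spatial positions of the final feature map; the sketch below shows one plausible form (the normalization choice is an assumption).

```python
# Build an image-level affinity map from an encoder's final feature map.
import torch

def affinity_map(feat):                     # feat: [C, H, W]
    C, H, W = feat.shape
    f = feat.reshape(C, H * W)              # flatten spatial positions
    f = torch.nn.functional.normalize(f, dim=0)
    return (f.T @ f).softmax(-1)            # [HW, HW] pairwise affinities

print(affinity_map(torch.randn(16, 8, 8)).shape)  # torch.Size([64, 64])
```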
arXiv:2410.23856 [pdf, other] cs.CL, cs.LG

Can Language Models Perform Robust Reasoning in Chain-of-thought Prompting with Noisy Rationales?

Authors: Zhanke Zhou, Rong Tao, Jianing Zhu, Yiwen Luo, Zengmao Wang, Bo Han

Abstract: This paper investigates an under-explored challenge in large language models (LLMs): chain-of-thought prompting with noisy rationales, which include irrelevant or inaccurate reasoning thoughts within examples used for in-context learning. We construct the NoRa dataset, which is tailored to evaluate the robustness of reasoning in the presence of noisy rationales. Our findings on the NoRa dataset reveal a prevalent vulnerability to such noise among current LLMs, with existing robust methods like self-correction and self-consistency showing limited efficacy. Notably, compared to prompting with clean rationales, the base LLM drops by 1.4%-19.8% in accuracy with irrelevant thoughts and more drastically by 2.2%-40.4% with inaccurate thoughts. Addressing this challenge necessitates external supervision that should be accessible in practice. Here, we propose the method of contrastive denoising with noisy chain-of-thought (CD-CoT). It enhances LLMs' denoising-reasoning capabilities by contrasting noisy rationales with only one clean rationale, which can be the minimal requirement for denoising-purpose prompting. This method follows a principle of exploration and exploitation: (1) rephrasing and selecting rationales in the input space to achieve explicit denoising and (2) exploring diverse reasoning paths and voting on answers in the output space. Empirically, CD-CoT demonstrates an average improvement of 17.8% in accuracy over the base model and shows significantly stronger denoising capabilities than baseline methods. The source code is publicly available at https://github.com/tmlr-group/NoisyRationales.

Submitted 31 October, 2024; originally announced October 2024.

Comments: Accepted by NeurIPS 2024
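The explore-and-vote recipe can be sketched schematically with a hypothetical `llm(prompt)` callable; the prompts and voting below are simplified, not CD-CoT's exact procedure.

```python
# Schematic of CD-CoT's explore-then-vote recipe. `llm` is a hypothetical
# text-completion callable supplied by the caller.
from collections import Counter

def cd_cot(question, noisy_rationale, clean_rationale, llm, n_paths=5):
    # contrast the noisy rationale with one clean one (input-space denoising)
    rephrased = llm(f"Rewrite this rationale, keeping only steps consistent "
                    f"with the clean example.\nClean: {clean_rationale}\n"
                    f"Noisy: {noisy_rationale}")
    # explore diverse reasoning paths, then vote on answers (output space)
    answers = [llm(f"{rephrased}\nQ: {question}\nA:") for _ in range(n_paths)]
    return Counter(answers).most_common(1)[0][0]
```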
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23332">arXiv:2410.23332</a> <span> [<a href="https://arxiv.org/pdf/2410.23332">pdf</a>, <a href="https://arxiv.org/format/2410.23332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MoLE: Enhancing Human-centric Text-to-image Diffusion via Mixture of Low-rank Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jie Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yixiong Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+M">Mingyu Ding</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Leye Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.23332v1-abstract-full" style="display: inline;"> Text-to-image diffusion has attracted vast attention due to its impressive image-generation capabilities. However, when it comes to human-centric text-to-image generation, particularly in the context of faces and hands, the results often fall short of naturalness due to insufficient training priors. We alleviate the issue in this work from two perspectives. 1) From the data aspect, we carefully collect a human-centric dataset comprising over one million high-quality human-in-the-scene images and two specific sets of close-up images of faces and hands. These datasets collectively provide a rich prior knowledge base to enhance the human-centric image generation capabilities of the diffusion model. 2) On the methodological front, we propose a simple yet effective method called Mixture of Low-rank Experts (MoLE) by considering low-rank modules trained on close-up hand and face images respectively as experts. This concept draws inspiration from our observation of low-rank refinement, where a low-rank module trained on a customized close-up dataset has the potential to enhance the corresponding image part when applied at an appropriate scale. To validate the superiority of MoLE in the context of human-centric image generation compared to state-of-the-art methods, we construct two benchmarks and perform evaluations with diverse metrics and human studies. Datasets, model, and code are released at https://sites.google.com/view/mole4diffuser/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at NeurIPS 2024</span> </p>
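<p class="is-size-7">A minimal sketch of the general mixture-of-low-rank-experts idea (the gating design, rank, and attachment point inside the diffusion network are assumptions, not MoLE's exact formulation):</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class LowRankExpert(nn.Module):
    """One LoRA-style expert: a rank-r update on top of a frozen projection."""
    def __init__(self, dim, rank=4):
        super().__init__()
        self.down = nn.Linear(dim, rank, bias=False)
        self.up = nn.Linear(rank, dim, bias=False)

    def forward(self, x):
        return self.up(self.down(x))

class MixtureOfLowRankExperts(nn.Module):
    """Base output plus a gated sum of low-rank experts
    (e.g., one trained on faces and one on hands)."""
    def __init__(self, dim, n_experts=2, rank=4):
        super().__init__()
        self.experts = nn.ModuleList(LowRankExpert(dim, rank) for _ in range(n_experts))
        self.gate = nn.Linear(dim, n_experts)

    def forward(self, x, base_out):
        w = torch.softmax(self.gate(x), dim=-1)                     # per-token weights
        delta = torch.stack([e(x) for e in self.experts], dim=-1)   # (..., dim, E)
        return base_out + (delta * w.unsqueeze(-2)).sum(-1)
</code></pre>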
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22637">arXiv:2410.22637</a> <span> [<a href="https://arxiv.org/pdf/2410.22637">pdf</a>, <a href="https://arxiv.org/format/2410.22637">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Consistency Diffusion Bridge Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+G">Guande He</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kaiwen Zheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfei Chen</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+F">Fan Bao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.22637v2-abstract-full" style="display: inline;"> Diffusion models (DMs) have become the dominant paradigm of generative modeling in a variety of domains by learning stochastic processes from noise to data. Recently, diffusion denoising bridge models (DDBMs), a new formulation of generative modeling that builds stochastic processes between fixed data endpoints based on a reference diffusion process, have achieved empirical success across tasks with coupled data distributions, such as image-to-image translation. However, the sampling process of DDBMs typically requires hundreds of network evaluations to achieve decent performance, which may impede their practical deployment due to high computational demands. In this work, inspired by the recent advance of consistency models in DMs, we tackle this problem by learning the consistency function of the probability-flow ordinary differential equation (PF-ODE) of DDBMs, which directly predicts the solution at a starting step given any point on the ODE trajectory. Based on a dedicated general-form ODE solver, we propose two paradigms: consistency bridge distillation and consistency bridge training, which are flexible to apply on DDBMs with broad design choices. Experimental results show that our proposed method can sample $4\times$ to $50\times$ faster than the base DDBM and produce better visual quality given the same step in various tasks with pixel resolution ranging from $64 \times 64$ to $256 \times 256$, as well as supporting downstream tasks such as semantic interpolation in the data space. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p>
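<p class="is-size-7">For intuition, a schematic consistency-distillation objective of the kind this line of work builds on (all function names here are placeholders; the paper's general-form ODE solver and loss weighting are not reproduced):</p> <pre><code class="language-python">
import torch

def consistency_bridge_distillation_loss(f_student, f_teacher_ema, ode_step, x_t, t, t_next):
    """The student's prediction from (x_t, t) should match the EMA teacher's
    prediction one solver step ahead, so both map onto the same PF-ODE solution."""
    x_next = ode_step(x_t, t, t_next)             # one solver step along the trajectory
    with torch.no_grad():
        target = f_teacher_ema(x_next, t_next)    # teacher's consistency output
    return torch.mean((f_student(x_t, t) - target) ** 2)
</code></pre>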
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20740">arXiv:2410.20740</a> <span> [<a href="https://arxiv.org/pdf/2410.20740">pdf</a>, <a href="https://arxiv.org/format/2410.20740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Study on Static Application Security Testing (SAST) Tools for Android </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingyun Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kaixuan Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sen Chen</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Lingling Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junjie Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xiaofei Xie</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.20740v1-abstract-full" style="display: inline;"> To identify security vulnerabilities in Android applications, numerous static application security testing (SAST) tools have been proposed. However, assessing their overall performance on diverse vulnerability types remains a non-trivial challenge. Firstly, the absence of a unified evaluation platform for defining and describing tools' supported vulnerability types, coupled with the lack of normalization for the intricate and varied reports generated by different tools, significantly adds to the complexity. Secondly, there is a scarcity of adequate benchmarks, particularly those derived from real-world scenarios. To address these problems, we are the first to propose a unified platform named VulsTotal, supporting various vulnerability types and enabling comprehensive and versatile analysis across diverse SAST tools. Specifically, we begin by meticulously selecting 11 free and open-source SAST tools from a pool of 97 existing options, adhering to clearly defined criteria. After that, we invest significant effort in comprehending the detection rules of each tool, subsequently unifying 67 general/common vulnerability types for Android SAST tools. We also redefine and implement a standardized reporting format, ensuring uniformity in presenting results across all tools. Additionally, to mitigate the problem of benchmarks, we conduct a manual analysis of a large number of CVEs to construct a new CVE-based benchmark grounded in our comprehension of Android app vulnerabilities. Leveraging the evaluation platform, which integrates both existing synthetic benchmarks and the newly constructed CVE-based benchmark from this study, we conduct a comprehensive analysis to evaluate and compare the selected tools from various perspectives, such as general vulnerability type coverage, type consistency, tool effectiveness, and time performance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TSE</span> </p>
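<p class="is-size-7">To make the "standardized reporting format" idea concrete, a hypothetical normalized-finding schema (field names are illustrative assumptions, not VulsTotal's actual format):</p> <pre><code class="language-python">
from dataclasses import dataclass

@dataclass
class Finding:
    """A tool-agnostic SAST finding."""
    tool: str            # which SAST tool produced the finding
    vuln_type: str       # one of the unified general/common vulnerability types
    file: str            # source file flagged by the tool
    line: int            # line number of the finding
    raw_message: str     # original tool-specific report text

def normalize(tool_name, report, type_map):
    """Map a tool-specific report dict onto the unified schema via a per-tool
    mapping from native rule IDs to unified vulnerability types."""
    return Finding(tool=tool_name,
                   vuln_type=type_map[report["rule_id"]],
                   file=report["file"],
                   line=int(report["line"]),
                   raw_message=report["message"])
</code></pre>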
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20622">arXiv:2410.20622</a> <span> [<a href="https://arxiv.org/pdf/2410.20622">pdf</a>, <a href="https://arxiv.org/ps/2410.20622">ps</a>, <a href="https://arxiv.org/format/2410.20622">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Analysis of PDEs">math.AP</span> </div> </div> <p class="title is-5 mathjax"> Kernel Approximation of Fisher-Rao Gradient Flows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jia-Jie Zhu</a>, <a href="/search/cs?searchtype=author&query=Mielke%2C+A">Alexander Mielke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.20622v1-abstract-full" style="display: inline;"> The purpose of this paper is to answer a few open questions in the interface of kernel methods and PDE gradient flows. Motivated by recent advances in machine learning, particularly in generative modeling and sampling, we present a rigorous investigation of Fisher-Rao and Wasserstein type gradient flows concerning their gradient structures, flow equations, and their kernel approximations. Specifically, we focus on the Fisher-Rao (also known as Hellinger) geometry and its various kernel-based approximations, developing a principled theoretical framework using tools from PDE gradient flows and optimal transport theory. We also provide a complete characterization of gradient flows in the maximum-mean discrepancy (MMD) space, with connections to existing learning and inference algorithms. Our analysis reveals precise theoretical insights linking Fisher-Rao flows, Stein flows, kernel discrepancies, and nonparametric regression. We then rigorously prove evolutionary $\Gamma$-convergence for kernel-approximated Fisher-Rao flows, providing theoretical guarantees beyond pointwise convergence. Finally, we analyze energy dissipation using the Helmholtz-Rayleigh principle, establishing important connections between classical theory in mechanics and modern machine learning practice. Our results provide a unified theoretical foundation for understanding and analyzing approximations of gradient flows in machine learning applications through a rigorous gradient flow and variational method perspective. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
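<p class="is-size-7 mathjax">For orientation, one standard form of the Fisher-Rao (Hellinger) gradient flow of an energy $\mathcal{E}$ over probability measures, the object whose kernel approximations the paper studies (stated from the general literature, not quoted from the paper): $$\partial_t \rho_t = -\,\rho_t\left(\frac{\delta \mathcal{E}}{\delta \rho}[\rho_t] - \int \frac{\delta \mathcal{E}}{\delta \rho}[\rho_t]\,\mathrm{d}\rho_t\right),$$ where $\frac{\delta \mathcal{E}}{\delta \rho}$ is the first variation and subtracting its mean preserves total mass.</p>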
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19958">arXiv:2410.19958</a> <span> [<a href="https://arxiv.org/pdf/2410.19958">pdf</a>, <a href="https://arxiv.org/format/2410.19958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Iterative Linear Quadratic Estimation: Optimal Estimation for Hybrid Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Payne%2C+J+J">J. Joe Payne</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">James Zhu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+N+J">Nathan J. Kong</a>, <a href="/search/cs?searchtype=author&query=Johnson%2C+A+M">Aaron M. Johnson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.19958v1-abstract-full" style="display: inline;"> In this paper we present Hybrid iterative Linear Quadratic Estimation (HiLQE), an optimization-based offline state estimation algorithm for hybrid dynamical systems. We utilize the saltation matrix, a first-order approximation of the variational update through an event-driven hybrid transition, to calculate gradient information through hybrid events in the backward pass of an iterative linear quadratic optimization over state estimates. This enables accurate computation of the value function approximation at each timestep. Additionally, the forward pass in the iterative algorithm is augmented with hybrid dynamics in the rollout. A reference extension method is used to account for varying impact times when comparing states for the feedback gain in noise calculation. The proposed method is demonstrated on an ASLIP hopper system with position measurements. In comparison to the Salted Kalman Filter (SKF), the algorithm presented here achieves up to a 63.55% reduction in estimation error magnitude over all state dimensions near impact events. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
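<p class="is-size-7 mathjax">For reference, the saltation matrix mentioned above has a well-known closed form in the hybrid-systems literature (with guard $g(x,t)$, reset map $R(x,t)$, and pre-/post-event vector fields $F_1$, $F_2$, all evaluated at the event; stated here for orientation, not quoted from the paper): $$\Xi = D_x R + \frac{\left(F_2 - D_x R\, F_1 - D_t R\right) D_x g}{D_t g + D_x g\, F_1}.$$</p>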
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18472">arXiv:2410.18472</a> <span> [<a href="https://arxiv.org/pdf/2410.18472">pdf</a>, <a href="https://arxiv.org/format/2410.18472">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> What If the Input is Expanded in OOD Detection? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Boxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianing Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zengmao Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.18472v2-abstract-full" style="display: inline;"> Out-of-distribution (OOD) detection aims to identify OOD inputs from unknown classes, which is important for the reliable deployment of machine learning models in the open world. Various scoring functions have been proposed to distinguish OOD inputs from in-distribution (ID) data. However, existing methods generally focus on excavating the discriminative information from a single input, which implicitly limits the representation dimension. In this work, we introduce a novel perspective, i.e., employing different common corruptions on the input space, to expand it. We reveal an interesting phenomenon termed confidence mutation, where the confidence of OOD data can decrease significantly under the corruptions, while ID data shows a higher confidence expectation considering the resistance of semantic features. Based on that, we formalize a new scoring method, namely, Confidence aVerage (CoVer), which can capture the dynamic differences by simply averaging the scores obtained from different corrupted inputs and the original ones, making the OOD and ID distributions more separable in detection tasks. Extensive experiments and analyses have been conducted to understand and verify the effectiveness of CoVer. The code is publicly available at: https://github.com/tmlr-group/CoVer. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p>
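<p class="is-size-7">The averaging step of CoVer is simple enough to sketch directly; max-softmax confidence is used here for concreteness, and the corruption set is left abstract (the paper's exact scoring function may differ):</p> <pre><code class="language-python">
import torch

def cover_score(model, x, corruptions):
    """Average a confidence score over the original input and corrupted copies;
    ID inputs tend to keep high confidence under corruption while OOD confidence drops.
    `corruptions` is a list of functions mapping a batch to a corrupted batch."""
    views = [x] + [c(x) for c in corruptions]
    confs = []
    for v in views:
        with torch.no_grad():
            probs = torch.softmax(model(v), dim=-1)
        confs.append(probs.max(dim=-1).values)        # max-softmax confidence
    return torch.stack(confs, dim=0).mean(dim=0)      # higher => more likely ID
</code></pre>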
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16843">arXiv:2410.16843</a> <span> [<a href="https://arxiv.org/pdf/2410.16843">pdf</a>, <a href="https://arxiv.org/format/2410.16843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Trustworthy Alignment of Retrieval-Augmented Large Language Models via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zongmeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yufeng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinhua Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wengang Zhou</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xiang Qi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Houqiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.16843v1-abstract-full" style="display: inline;"> Trustworthiness is an essential prerequisite for the real-world application of large language models. In this paper, we focus on the trustworthiness of language models with respect to retrieval augmentation. Despite being supported with external evidence, retrieval-augmented generation still suffers from hallucinations, one primary cause of which is the conflict between contextual and parametric knowledge. We deem that retrieval-augmented language models have the inherent capability of supplying responses according to both contextual and parametric knowledge. Inspired by aligning language models with human preference, we take the first step towards aligning retrieval-augmented language models to a status where they respond relying merely on the external evidence and disregard the interference of parametric knowledge. Specifically, we propose a reinforcement learning based algorithm, Trustworthy-Alignment, theoretically and experimentally demonstrating large language models' capability of reaching a trustworthy status without explicit supervision on how to respond. Our work highlights the potential of large language models to explore their intrinsic abilities on their own, and expands the application scenarios of alignment from fulfilling human preference to creating trustworthy agents. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 41st International Conference on Machine Learning, PMLR 235:59827-59850, 2024 </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16261">arXiv:2410.16261</a> <span> [<a href="https://arxiv.org/pdf/2410.16261">pdf</a>, <a href="https://arxiv.org/format/2410.16261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mini-InternVL: A Flexible-Transfer Pocket Multimodal Model with 5% Parameters and 90% Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangwei Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+E">Erfei Cui</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yiming Ren</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiyun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinguo Zhu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+H">Hao Tian</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Shenglong Ye</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xizhou Zhu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Lewei Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+T">Tong Lu</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>,
href="/search/cs?searchtype=author&query=Dai%2C+J">Jifeng Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenhai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16261v3-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in vision-language tasks across a broad spectrum of domains. However, the large model scale and associated high computational costs pose significant challenges for training and deploying MLLMs on consumer-grade GPUs or edge devices, thereby hindering their widespread application. In this work, we introduce Mini-Inter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16261v3-abstract-full').style.display = 'inline'; document.getElementById('2410.16261v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16261v3-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in vision-language tasks across a broad spectrum of domains. However, the large model scale and associated high computational costs pose significant challenges for training and deploying MLLMs on consumer-grade GPUs or edge devices, thereby hindering their widespread application. In this work, we introduce Mini-InternVL, a series of MLLMs with parameters ranging from 1B to 4B, which achieves 90% of the performance with only 5% of the parameters. This significant improvement in efficiency and effectiveness makes our models more accessible and applicable in various real-world scenarios. To further promote the adoption of our models, we develop a unified adaptation framework for Mini-InternVL, which enables our models to transfer and outperform specialized models in downstream tasks, including autonomous driving, medical images, and remote sensing. We believe that our study can provide valuable insights and resources to advance the development of efficient and effective MLLMs. Code is available at https://github.com/OpenGVLab/InternVL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16261v3-abstract-full').style.display = 'none'; document.getElementById('2410.16261v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15659">arXiv:2410.15659</a> <span> [<a href="https://arxiv.org/pdf/2410.15659">pdf</a>, <a href="https://arxiv.org/format/2410.15659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Decentralized Hybrid Precoding for Massive MU-MIMO ISAC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yin Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+D">Dazhi He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoyang Li</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+Y">YunFeng Guan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.15659v1-abstract-full" style="display: inline;"> Integrated sensing and communication (ISAC) is a promising technology designed to provide both high-rate communication capabilities and sensing capabilities. However, in massive multi-user multiple-input multiple-output ISAC (Massive MU-MIMO-ISAC) systems, dense user access creates a serious multi-user interference (MUI) problem, leading to degradation of communication performance. To alleviate this problem, we propose a decentralized baseband processing (DBP) precoding method. We first model the MUI of dense user scenarios with minimizing the Cramer-Rao bound (CRB) as an objective function. Hybrid precoding is an attractive ISAC technique, and hybrid precoding using partially connected structures (PCS) can effectively reduce hardware cost and power consumption. We mitigate the MUI between dense users based on Tomlinson-Harashima precoding (THP). We demonstrate the effectiveness of the proposed method through simulation experiments. Compared with existing methods, it can effectively improve communication data rates and energy efficiency in dense user access scenarios, and reduce the hardware complexity of Massive MU-MIMO-ISAC systems. The experimental results demonstrate the usefulness of our method for mitigating the MUI problem in ISAC systems with dense user access. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
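<p class="is-size-7">Since the entry above leans on Tomlinson-Harashima precoding, a minimal real-valued THP sketch (the feedback matrix would come from a channel factorization; complex symbols and the paper's decentralized structure are omitted):</p> <pre><code class="language-python">
import numpy as np

def thp_precode(s, B, tau=2.0):
    """Successively pre-subtract interference from already-encoded symbols,
    folding each result back into [-tau/2, tau/2) with a modulo operation.
    B is a strictly lower-triangular feedback matrix; s is the symbol vector."""
    K = len(s)
    x = np.zeros(K)
    for k in range(K):
        v = s[k] - B[k, :k] @ x[:k]              # subtract known interference
        x[k] = (v + tau / 2) % tau - tau / 2     # modulo keeps transmit power bounded
    return x
</code></pre>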
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15371">arXiv:2410.15371</a> <span> [<a href="https://arxiv.org/pdf/2410.15371">pdf</a>, <a href="https://arxiv.org/format/2410.15371">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FrameBridge: Improving Image-to-Video Generation with Bridge Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuji Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zehua Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.15371v1-abstract-full" style="display: inline;"> Image-to-video (I2V) generation is gaining increasing attention with its wide application in video synthesis. Recently, diffusion-based I2V models have achieved remarkable progress given their novel design of network architecture, cascaded framework, and motion representation. However, restricted by their noise-to-data generation process, diffusion-based methods inevitably struggle to generate video samples with both appearance consistency and temporal coherence from uninformative Gaussian noise, which may limit their synthesis quality. In this work, we present FrameBridge, which takes the given static image as the prior of the video target and establishes a tractable bridge model between them.
By formulating I2V synthesis as a frames-to-frames generation task and modelling it with a data-to-data process, we fully exploit the information in the input image and facilitate the generative model to learn the image animation process. In two popular settings of training I2V models, namely fine-tuning a pre-trained text-to-video (T2V) model or training from scratch, we further propose two techniques, SNR-Aligned Fine-tuning (SAF) and neural prior, which improve the fine-tuning efficiency of diffusion-based T2V models to FrameBridge and the synthesis quality of bridge-based I2V models respectively. Experiments conducted on WebVid-2M and UCF-101 demonstrate that: (1) our FrameBridge achieves superior I2V quality in comparison with the diffusion counterpart (zero-shot FVD 83 vs. 176 on MSR-VTT and non-zero-shot FVD 122 vs. 171 on UCF-101); (2) our proposed SAF and neural prior effectively enhance the ability of bridge-based I2V models in the scenarios of fine-tuning and training from scratch. Demo samples can be visited at: https://framebridge-demo.github.io/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
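<p class="is-size-7">A schematic of the data-to-data bridge marginal that motivates FrameBridge (a generic Brownian-bridge form pinned at both endpoints; FrameBridge's actual reference process and parameterization may differ):</p> <pre><code class="language-python">
import torch

def bridge_marginal_sample(x_data, x_prior, t, sigma=1.0):
    """Sample a bridge state pinned to the target frames at t=0 and to the
    image-based prior at t=1; the noise scale vanishes at both endpoints."""
    mean = (1.0 - t) * x_data + t * x_prior      # interpolate between endpoints
    std = sigma * (t * (1.0 - t)) ** 0.5
    return mean + std * torch.randn_like(x_data)
</code></pre>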
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15266">arXiv:2410.15266</a> <span> [<a href="https://arxiv.org/pdf/2410.15266">pdf</a>, <a href="https://arxiv.org/format/2410.15266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> GSSF: Generalized Structural Sparse Function for Deep Cross-modal Metric Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Diao%2C+H">Haiwen Diao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shang Gao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiawen Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.15266v1-abstract-full" style="display: inline;"> Cross-modal metric learning is a prominent research topic that bridges the semantic heterogeneity between vision and language. Existing methods frequently utilize simple cosine or complex distance metrics to transform pairwise features into a similarity score, which suffers from an inadequate or inefficient capability for distance measurement. Consequently, we propose a Generalized Structural Sparse Function to dynamically capture thorough and powerful relationships across modalities for pairwise similarity learning while remaining concise yet efficient. Specifically, the distance metric delicately encapsulates two formats of diagonal and block-diagonal terms, automatically distinguishing and highlighting the cross-channel relevancy and dependency inside a structured and organized topology. It thereby adapts to the optimal matching patterns between paired features, reaching a sweet spot between model complexity and capability. Extensive experiments on cross-modal and two extra uni-modal retrieval tasks (image-text retrieval, person re-identification, fine-grained image retrieval) have validated its superiority and flexibility over various popular retrieval frameworks. More importantly, we further discover that it can be seamlessly incorporated into multiple application scenarios, and demonstrates promising prospects from Attention Mechanism to Knowledge Distillation in a plug-and-play manner. Our code is publicly available at: https://github.com/Paranioar/GSSF. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 9 figures, Accepted by TIP2024</span> </p>
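<p class="is-size-7">A toy version of a metric with the diagonal-plus-block-diagonal structure described above (block size and parameterization are assumptions, not GSSF itself):</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class StructuredSparseMetric(nn.Module):
    """Similarity s(x, y) = -(x - y)^T M (x - y) with
    M = diag(d) + blockdiag(B_1, ..., B_k)."""
    def __init__(self, dim, block_size):
        super().__init__()
        assert dim % block_size == 0
        self.diag = nn.Parameter(torch.ones(dim))
        self.blocks = nn.Parameter(torch.zeros(dim // block_size, block_size, block_size))

    def forward(self, x, y):
        m = torch.diag(self.diag) + torch.block_diag(*self.blocks)  # structured metric
        diff = x - y
        return -torch.einsum("bi,ij,bj->b", diff, m, diff)
</code></pre>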
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14210">arXiv:2410.14210</a> <span> [<a href="https://arxiv.org/pdf/2410.14210">pdf</a>, <a href="https://arxiv.org/format/2410.14210">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Shape Transformation Driven by Active Contour for Class-Imbalanced Semi-Supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuliang Gu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yepeng Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhichao Sun</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinchi Zhu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yongchao Xu</a>, <a href="/search/cs?searchtype=author&query=Najman%2C+L">Laurent Najman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.14210v1-abstract-full" style="display: inline;"> Annotating 3D medical images demands expert knowledge and is time-consuming. As a result, semi-supervised learning (SSL) approaches have gained significant interest in 3D medical image segmentation. The significant size differences among various organs in the human body lead to imbalanced class distribution, which is a major challenge in the real-world application of these SSL approaches. To address this issue, we develop a novel Shape Transformation driven by Active Contour (STAC), which enlarges smaller organs to alleviate the imbalanced class distribution across different organs. Inspired by curve evolution theory in active contour methods, STAC employs a signed distance function (SDF) as the level set function to implicitly represent the shape of organs, and deforms voxels in the direction of the steepest descent of the SDF (i.e., the normal vector). To ensure that voxels far from the expanded organs remain unchanged, we design an SDF-based weight function to control the degree of deformation for each voxel. We then use STAC as a data-augmentation process during the training stage. Experimental results on two benchmark datasets demonstrate that the proposed method significantly outperforms some state-of-the-art methods. Source code is publicly available at https://github.com/GuGuLL123/STAC. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM), Dec 2024, Lisbon (Portugal), Portugal </p>
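<p class="is-size-7">A rough sketch of the SDF-driven deformation: move sampling coordinates along the SDF gradient direction, with a weight that decays away from the organ surface (the Gaussian weight here is an assumed stand-in for the paper's SDF-based weight function):</p> <pre><code class="language-python">
import numpy as np

def stac_deform_coords(sdf, spacing=1.0, step=1.0, sigma=3.0):
    """Return a warped sampling grid that expands the region enclosed by the
    zero level set of `sdf` while leaving distant voxels nearly unchanged."""
    normal = np.stack(np.gradient(sdf, spacing), axis=0)          # SDF gradient field
    normal /= np.linalg.norm(normal, axis=0, keepdims=True) + 1e-8
    weight = np.exp(-(sdf / sigma) ** 2)                          # ~1 near surface, ~0 far
    coords = np.stack(np.meshgrid(*map(np.arange, sdf.shape), indexing="ij"), axis=0)
    return coords - step * weight * normal                        # grid for resampling
</code></pre>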
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12896">arXiv:2410.12896</a> <span> [<a href="https://arxiv.org/pdf/2410.12896">pdf</a>, <a href="https://arxiv.org/format/2410.12896">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Data Synthesis and Augmentation for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Ke Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiahui Zhu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Minjie Ren</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zeming Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shiwei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zongye Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenkai Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoyu Wu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Q">Qiqi Zhan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingjie Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunhong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.12896v1-abstract-full" style="display: inline;"> The success of Large Language Models (LLMs) is inherently linked to the availability of vast, diverse, and high-quality data for training and evaluation.
However, the growth rate of high-quality data is significantly outpaced by the expansion of training datasets, leading to a looming data exhaustion crisis. This underscores the urgent need to enhance data efficiency and explore new data sources. In this context, synthetic data has emerged as a promising solution. Currently, data generation primarily consists of two major approaches: data augmentation and synthesis. This paper comprehensively reviews and summarizes data generation techniques throughout the lifecycle of LLMs, including data preparation, pre-training, fine-tuning, instruction-tuning, preference alignment, and applications. Furthermore, we discuss the current constraints faced by these methods and investigate potential pathways for future development and research. Our aspiration is to equip researchers with a clear understanding of these methodologies, enabling them to swiftly identify appropriate data generation strategies in the construction of LLMs, while providing valuable insights for future exploration. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12757">arXiv:2410.12757</a> <span> [<a href="https://arxiv.org/pdf/2410.12757">pdf</a>, <a href="https://arxiv.org/format/2410.12757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> StyleDistance: Stronger Content-Independent Style Embeddings with Synthetic Parallel Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Patel%2C+A">Ajay Patel</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiacheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+J">Justin Qiu</a>, <a href="/search/cs?searchtype=author&query=Horvitz%2C+Z">Zachary Horvitz</a>, <a href="/search/cs?searchtype=author&query=Apidianaki%2C+M">Marianna Apidianaki</a>, <a href="/search/cs?searchtype=author&query=McKeown%2C+K">Kathleen McKeown</a>, <a href="/search/cs?searchtype=author&query=Callison-Burch%2C+C">Chris Callison-Burch</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.12757v1-abstract-full" style="display: inline;"> Style representations aim to embed texts with similar writing styles closely and texts with different styles far apart, regardless of content. However, the contrastive triplets often used for training these representations may vary in both style and content, leading to potential content leakage in the representations.
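<p class="is-size-7">The contrastive setup in the entry that follows lends itself to a compact sketch: pull an anchor toward a same-style paraphrase and push it from different-style paraphrases of the same content (an InfoNCE form; the paper's exact loss may differ):</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def style_contrastive_loss(anchor, positive, negatives, temperature=0.1):
    """anchor, positive: (B, dim); negatives: (N, dim)."""
    a = F.normalize(anchor, dim=-1)
    pos = F.normalize(positive, dim=-1)
    neg = F.normalize(negatives, dim=-1)
    logits = torch.cat([(a * pos).sum(-1, keepdim=True),     # anchor-positive similarity
                        a @ neg.T], dim=-1) / temperature    # anchor-negative similarities
    labels = torch.zeros(logits.shape[0], dtype=torch.long)  # positive sits at index 0
    return F.cross_entropy(logits, labels)
</code></pre>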
We introduce StyleDistance, a novel approach to training stronger content-independent style embeddings. We use a large language model to create a synthetic dataset of near-exact paraphrases with controlled style variations, and produce positive and negative examples across 40 distinct style features for precise contrastive learning. We assess the quality of our synthetic data and embeddings through human and automatic evaluations. StyleDistance enhances the content-independence of style embeddings, which generalize to real-world benchmarks and outperform leading style representations in downstream applications. Our model can be found at https://huggingface.co/StyleDistance/styledistance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
arXiv:2410.12316 [cs.LG, cs.DC]
TPFL: A Trustworthy Personalized Federated Learning Framework via Subjective Logic
Authors: Jinqian Chen, Jihua Zhu
Abstract: Federated learning (FL) enables collaborative model training across distributed clients while preserving data privacy. Despite its widespread adoption, most FL approaches focusing solely on privacy protection fall short in scenarios where trustworthiness is crucial, necessitating advancements in secure training, dependable decision-making mechanisms, robustness to corruptions, and enhanced performance with Non-IID data. To bridge this gap, we introduce the Trustworthy Personalized Federated Learning (TPFL) framework, designed for classification tasks via subjective logic. Specifically, TPFL adopts a unique approach by employing subjective logic to construct federated models, providing probabilistic decisions coupled with an assessment of uncertainty rather than mere probability assignments. By incorporating a trainable heterogeneity prior into the local training phase, TPFL effectively mitigates the adverse effects of data heterogeneity. Model uncertainty and instance uncertainty are further utilized to ensure the safety and reliability of the training and inference stages. Through extensive experiments on widely recognized federated learning benchmarks, we demonstrate that TPFL not only achieves competitive performance compared with advanced methods but also exhibits resilience against prevalent malicious attacks, robustness to domain shifts, and reliability in high-stakes scenarios.
Submitted 16 October, 2024; originally announced October 2024.
Comments: 17 pages with appendix. MSC Class: 68T05
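For readers unfamiliar with subjective logic, the following is the textbook evidential mapping the abstract alludes to; TPFL's exact construction may differ. Logits become Dirichlet evidence, yielding per-class beliefs plus an explicit uncertainty mass:

```python
# Standard subjective-logic view of a classifier's output: non-negative
# evidence defines a Dirichlet distribution whose beliefs and uncertainty
# mass sum to one, instead of a bare softmax probability.
import torch
import torch.nn.functional as F

def subjective_opinion(logits):
    evidence = F.softplus(logits)               # non-negative evidence per class
    alpha = evidence + 1.0                      # Dirichlet concentration
    strength = alpha.sum(-1, keepdim=True)      # S = sum_k alpha_k
    belief = evidence / strength                # b_k: support for class k
    uncertainty = logits.shape[-1] / strength   # u = K / S
    return belief, uncertainty

belief, u = subjective_opinion(torch.randn(4, 10))
print(belief.sum(-1) + u.squeeze(-1))           # beliefs + uncertainty = 1
```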
arXiv:2410.12269 [cs.CV]
LoD-Loc: Aerial Visual Localization using LoD 3D Map with Neural Wireframe Alignment
Authors: Juelin Zhu, Shen Yan, Long Wang, Shengyue Zhang, Yu Liu, Maojun Zhang
Abstract: We propose a new method named LoD-Loc for visual localization in the air. Unlike existing localization algorithms, LoD-Loc does not rely on complex 3D representations and can estimate the pose of an Unmanned Aerial Vehicle (UAV) using a Level-of-Detail (LoD) 3D map. LoD-Loc achieves this mainly by aligning the wireframe derived from the projected LoD model with the wireframe predicted by a neural network. Specifically, given a coarse pose provided by the UAV sensor, LoD-Loc hierarchically builds a cost volume over uniformly sampled pose hypotheses to describe the pose probability distribution and selects the pose with maximum probability. Each cost within this volume measures the degree of line alignment between the projected and predicted wireframes. LoD-Loc also devises a 6-DoF pose optimization algorithm that refines the previous result with a differentiable Gauss-Newton method. As no public dataset exists for the studied problem, we collect two datasets with map levels of LoD3.0 and LoD2.0, along with real RGB queries and ground-truth pose annotations.
We benchmark our method and demonstrate that LoD-Loc achieves excellent performance, even surpassing current state-of-the-art methods that use textured 3D models for localization. The code and dataset are available at https://victorzoo.github.io/LoD-Loc.github.io/.
Submitted 16 October, 2024; originally announced October 2024.
Comments: Accepted by NeurIPS 2024; for project page, see https://victorzoo.github.io/LoD-Loc.github.io/
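A schematic of the hypothesis-scoring step described above; `project_wireframe` is a hypothetical stand-in for the real projection of the LoD model's edges under a pose:

```python
# Sample 6-DoF pose hypotheses uniformly around the coarse sensor prior and
# keep the one whose projected wireframe best matches the network's predicted
# wireframe probability map (a proxy for the cost-volume argmax).
import numpy as np

def score_pose(pose, prob_map, project_wireframe):
    u, v = project_wireframe(pose)  # integer pixel coords of projected edges
    ok = (u >= 0) & (u < prob_map.shape[1]) & (v >= 0) & (v < prob_map.shape[0])
    return prob_map[v[ok], u[ok]].mean() if ok.any() else -np.inf

def best_pose(prior, half_range, prob_map, project_wireframe, n=4096):
    hyps = prior + np.random.uniform(-half_range, half_range, size=(n, 6))
    scores = [score_pose(p, prob_map, project_wireframe) for p in hyps]
    return hyps[int(np.argmax(scores))]
```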
arXiv:2410.11989 [cs.RO]
Dynamic Open-Vocabulary 3D Scene Graphs for Long-term Language-Guided Mobile Manipulation
Authors: Zhijie Yan, Shufei Li, Zuoxu Wang, Lixiu Wu, Han Wang, Jun Zhu, Lijiang Chen, Jihong Liu
Abstract: Enabling mobile robots to perform long-term tasks in dynamic real-world environments is a formidable challenge, especially when the environment changes frequently due to human-robot interactions or the robot's own actions. Traditional methods typically assume static scenes, which limits their applicability in the continuously changing real world. To overcome these limitations, we present DovSG, a novel mobile manipulation framework that leverages dynamic open-vocabulary 3D scene graphs and a language-guided task planning module for long-term task execution. DovSG takes RGB-D sequences as input and utilizes vision-language models (VLMs) for object detection to obtain high-level object semantic features. Based on the segmented objects, a structured 3D scene graph is generated for low-level spatial relationships. Furthermore, an efficient mechanism for locally updating the scene graph allows the robot to adjust parts of the graph dynamically during interactions without the need for full scene reconstruction. This mechanism is particularly valuable in dynamic environments, enabling the robot to continually adapt to scene changes and effectively support the execution of long-term tasks. We validated our system in real-world environments with varying degrees of manual modifications, demonstrating its effectiveness and superior performance in long-term tasks. Our project page is available at: https://bjhyzj.github.io/dovsg-web.
Submitted 22 October, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
Comments: 8 pages, 5 figures
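A toy illustration of a locally-updatable scene graph as described: only nodes in the changed region are re-detected and re-linked. The data layout is invented for exposition; DovSG's graph additionally stores open-vocabulary features per object.

```python
# Minimal scene graph with proximity edges and a local update that touches
# only the region an interaction changed, leaving the rest of the graph intact.
import numpy as np

class SceneGraph:
    def __init__(self):
        self.pos = {}       # object id -> 3D position
        self.edges = set()  # unordered pairs with a "near" relation

    def _relink(self, oid, radius=1.0):
        self.edges = {e for e in self.edges if oid not in e}
        for other, p in self.pos.items():
            if other != oid and np.linalg.norm(p - self.pos[oid]) < radius:
                self.edges.add(frozenset((oid, other)))

    def local_update(self, center, region_radius, detections):
        """Re-detect only the changed region; keep the rest of the graph."""
        stale = [o for o, p in self.pos.items()
                 if np.linalg.norm(p - center) < region_radius]
        for o in stale:
            del self.pos[o]
            self.edges = {e for e in self.edges if o not in e}
        for oid, p in detections.items():  # fresh observations in the region
            self.pos[oid] = np.asarray(p, dtype=float)
            self._relink(oid)
```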
arXiv:2410.11772 [cs.CL, cs.LG]
Layer-wise Importance Matters: Less Memory for Better Performance in Parameter-efficient Fine-tuning of Large Language Models
Authors: Kai Yao, Penglei Gao, Lichun Li, Yuan Zhao, Xiaofeng Wang, Wei Wang, Jianke Zhu
Abstract: Parameter-Efficient Fine-Tuning (PEFT) methods have gained significant popularity for adapting pre-trained Large Language Models (LLMs) to downstream tasks, primarily due to their potential to significantly reduce memory and computational overheads. However, a common limitation in most PEFT approaches is their application of a uniform architectural design across all layers. This uniformity involves identical trainable modules and ignores the varying importance of each layer, leading to sub-optimal fine-tuning results. To overcome this limitation and obtain better performance, we develop a novel approach, Importance-aware Sparse Tuning (IST), which fully utilizes the inherent sparsity and selects the most important subset of layers with effective layer-wise importance scoring. The proposed IST is a versatile and plug-and-play technique compatible with various PEFT methods that operate on a per-layer basis. By leveraging the estimated importance scores, IST dynamically updates these selected layers in PEFT modules, leading to reduced memory demands. We further provide theoretical proof of convergence and empirical evidence of superior performance to demonstrate the advantages of IST over uniform updating strategies. Extensive experiments on a range of LLMs, PEFTs, and downstream tasks substantiate the effectiveness of our proposed method, showcasing IST's capacity to enhance existing layer-based PEFT methods. Our code is available at https://github.com/Kaiseem/IST.
Submitted 5 November, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
Comments: EMNLP 2024
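A sketch of the layer-selection mechanism: estimate a per-layer importance score and enable gradients only for the adapters of the top-k layers. The scoring rule is left abstract here; IST derives its own scores.

```python
# Importance-aware sparse tuning, schematically: rank per-layer PEFT adapters
# by an importance score and freeze all but the top-k, so frozen layers cost
# no gradients or optimizer state.
import torch

def select_layers(importance, k):
    return set(torch.topk(importance, k).indices.tolist())

def apply_sparse_tuning(adapters, importance, k):
    active = select_layers(importance, k)
    for i, adapter in enumerate(adapters):
        for p in adapter.parameters():
            p.requires_grad_(i in active)

adapters = [torch.nn.Linear(16, 16) for _ in range(12)]  # stand-in PEFT modules
apply_sparse_tuning(adapters, importance=torch.rand(12), k=4)
```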
arXiv:2410.11224 [q-bio.BM, cs.LG]
DeltaDock: A Unified Framework for Accurate, Efficient, and Physically Reliable Molecular Docking
Authors: Jiaxian Yan, Zaixi Zhang, Jintao Zhu, Kai Zhang, Jianfeng Pei, Qi Liu
Abstract: Molecular docking, a technique for predicting ligand binding poses, is crucial in structure-based drug design for understanding protein-ligand interactions. Recent advancements in docking methods, particularly those leveraging geometric deep learning (GDL), have demonstrated significant efficiency and accuracy advantages over traditional sampling methods. Despite these advancements, current methods are often tailored for specific docking settings, and limitations such as the neglect of protein side-chain structures, difficulties in handling large binding pockets, and challenges in predicting physically valid structures remain. To accommodate various docking settings and achieve accurate, efficient, and physically reliable docking, we propose a novel two-stage docking framework, DeltaDock, consisting of pocket prediction and site-specific docking. In the first stage, we innovatively reframe the pocket prediction task as a pocket-ligand alignment problem rather than direct prediction. We then follow a bi-level coarse-to-fine iterative refinement process to perform site-specific docking. Comprehensive experiments demonstrate the superior performance of DeltaDock.
Notably, in the blind docking setting, DeltaDock achieves a 31% relative improvement in docking success rate over the previous state-of-the-art GDL model. When physical validity is taken into account, this improvement increases to about 300%.
Submitted 16 October, 2024; v1 submitted 14 October, 2024; originally announced October 2024.
Comments: Accepted by NeurIPS'24
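A heavily simplified two-stage skeleton in the spirit of the framework: rank candidate pockets by an alignment score, then refine the pose coarse-to-fine. Both scoring functions are hypothetical placeholders, and real docking operates on full molecular coordinates rather than a single 3-vector:

```python
# Stage 1: pick the pocket that best aligns with the ligand.
# Stage 2: local search whose step size shrinks each round (coarse -> fine).
import numpy as np

def dock(ligand, pockets, align_score, pose_energy, rounds=3, step=2.0):
    pocket = max(pockets, key=lambda p: align_score(ligand, p))   # stage 1
    pose = np.asarray(pocket["center"], dtype=float)              # init pose
    for _ in range(rounds):                                       # stage 2
        for _ in range(64):
            cand = pose + step * np.random.randn(3)
            if pose_energy(ligand, pocket, cand) < pose_energy(ligand, pocket, pose):
                pose = cand
        step *= 0.5                                               # refine
    return pocket, pose
```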
arXiv:2410.10289 [cs.CV]
Fine-grained Abnormality Prompt Learning for Zero-shot Anomaly Detection
Authors: Jiawen Zhu, Yew-Soon Ong, Chunhua Shen, Guansong Pang
Abstract: Current zero-shot anomaly detection (ZSAD) methods show remarkable success in prompting large pre-trained vision-language models to detect anomalies in a target dataset without using any dataset-specific training or demonstration. However, these methods are often focused on crafting/learning prompts that capture only coarse-grained semantics of abnormality, e.g., high-level semantics like "damaged", "imperfect", or "defective" on carpet. They therefore have limited capability in recognizing diverse abnormality details with distinctive visual appearance, e.g., specific defect types like color stains, cuts, holes, and threads on carpet. To address this limitation, we propose FAPrompt, a novel framework designed to learn Fine-grained Abnormality Prompts for more accurate ZSAD. To this end, we introduce a novel compound abnormality prompting module in FAPrompt to learn a set of complementary, decomposed abnormality prompts, where each abnormality prompt is formed by a compound of shared normal tokens and a few learnable abnormal tokens. On the other hand, fine-grained abnormality patterns can differ greatly from one dataset to another. To enhance cross-dataset generalization, we further introduce a data-dependent abnormality prior module that learns to derive abnormality features from each query/test image as a sample-wise abnormality prior, grounding the abnormality prompts in a given target dataset. Comprehensive experiments conducted across 19 real-world datasets, covering both industrial defects and medical anomalies, demonstrate that FAPrompt substantially outperforms state-of-the-art methods by at least 3%-5% AUC/AP in both image- and pixel-level ZSAD tasks. Code is available at https://github.com/mala-lab/FAPrompt.
Submitted 14 October, 2024; originally announced October 2024.
Comments: 27 pages, 19 figures
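A sketch of the compound-prompt construction: each abnormality prompt reuses one shared set of normal-state tokens plus its own few learnable abnormal tokens. Token counts and dimensions here are illustrative assumptions:

```python
# Compound abnormality prompts: a single shared block of "normal" tokens is
# concatenated with per-prompt learnable "abnormal" tokens, yielding a bank
# of decomposed prompts that differ only in their abnormal parts.
import torch

n_prompts, n_shared, n_abnormal, dim = 10, 8, 2, 512
shared = torch.nn.Parameter(torch.randn(n_shared, dim))          # one copy
abnormal = torch.nn.Parameter(torch.randn(n_prompts, n_abnormal, dim))

def compound_prompts():
    base = shared.unsqueeze(0).expand(n_prompts, -1, -1)
    return torch.cat([base, abnormal], dim=1)

print(compound_prompts().shape)  # torch.Size([10, 10, 512])
```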
arXiv:2410.10238 [cs.CV, cs.AI]
ForgeryGPT: Multimodal Large Language Model For Explainable Image Forgery Detection and Localization
Authors: Jiawei Li, Fanrui Zhang, Jiaying Zhu, Esther Sun, Qiang Zhang, Zheng-Jun Zha
Abstract: Multimodal Large Language Models (MLLMs), such as GPT4o, have shown strong capabilities in visual reasoning and explanation generation. However, despite these strengths, they face significant challenges in the increasingly critical task of Image Forgery Detection and Localization (IFDL). Moreover, existing IFDL methods are typically limited to learning low-level, semantic-agnostic clues and merely provide a single outcome judgment. To tackle these issues, we propose ForgeryGPT, a novel framework that advances the IFDL task by capturing high-order forensics knowledge correlations of forged images from diverse linguistic feature spaces, while enabling explainable generation and interactive dialogue through a newly customized Large Language Model (LLM) architecture. Specifically, ForgeryGPT enhances traditional LLMs by integrating the Mask-Aware Forgery Extractor, which enables the extraction of precise forgery mask information from input images and facilitates pixel-level understanding of tampering artifacts. The Mask-Aware Forgery Extractor consists of a Forgery Localization Expert (FL-Expert) and a Mask Encoder, where the FL-Expert is augmented with an Object-agnostic Forgery Prompt and a Vocabulary-enhanced Vision Encoder, allowing for effective capture of multi-scale, fine-grained forgery details. To enhance its performance, we implement a three-stage training strategy, supported by our designed Mask-Text Alignment and IFDL Task-Specific Instruction Tuning datasets, which align vision-language modalities and improve forgery detection and instruction-following capabilities. Extensive experiments demonstrate the effectiveness of the proposed method.
Submitted 14 October, 2024; originally announced October 2024.
Comments: 16 pages, 14 figures
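A dataflow skeleton of the described architecture; every component is a named placeholder rather than ForgeryGPT's actual module:

```python
# Mask-aware MLLM pipeline, schematically: a localization expert predicts a
# forgery mask, a mask encoder turns it into tokens, and the LLM reasons over
# image tokens, mask tokens, and the user's question together.
def forgery_chat(image, question, fl_expert, mask_encoder, vision_encoder, llm):
    mask = fl_expert(image)                          # pixel-level forgery mask
    tokens = vision_encoder(image) + mask_encoder(mask)  # token lists concatenated
    answer = llm(prompt_tokens=tokens, text=question)
    return answer, mask
```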
arXiv:2410.09671 [cs.AI, cs.CL]
OpenR: An Open Source Framework for Advanced Reasoning with Large Language Models
Authors: Jun Wang, Meng Fang, Ziyu Wan, Muning Wen, Jiachen Zhu, Anjie Liu, Ziqin Gong, Yan Song, Lei Chen, Lionel M. Ni, Linyi Yang, Ying Wen, Weinan Zhang
Abstract: In this technical report, we introduce OpenR, an open-source framework designed to integrate key components for enhancing the reasoning capabilities of large language models (LLMs). OpenR unifies data acquisition, reinforcement learning training (both online and offline), and non-autoregressive decoding into a cohesive software platform. Our goal is to establish an open-source platform and community to accelerate the development of LLM reasoning. Inspired by the success of OpenAI's o1 model, which demonstrated improved reasoning abilities through step-by-step reasoning and reinforcement learning, OpenR integrates test-time compute, reinforcement learning, and process supervision to improve reasoning in LLMs. Our work is the first to provide an open-source framework that explores the core techniques of OpenAI's o1 model with reinforcement learning, achieving advanced reasoning capabilities beyond traditional autoregressive methods. We demonstrate the efficacy of OpenR by evaluating it on the MATH dataset, utilising publicly available data and search methods. Our initial experiments confirm substantial gains, with relative improvements in reasoning and performance driven by test-time computation and reinforcement learning through process reward models. The OpenR framework, including code, models, and datasets, is accessible at https://openreasoner.github.io.
Submitted 12 October, 2024; originally announced October 2024.
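A minimal example of the kind of test-time compute OpenR builds around: best-of-N selection with a process reward model (PRM) that scores each reasoning prefix. The generator and PRM are placeholder callables, not OpenR's API:

```python
# Generate N candidate reasoning chains, score every step prefix with a PRM,
# aggregate by the worst step, and keep the most robust chain.
def best_of_n(prompt, generate, prm, n=8):
    candidates = [generate(prompt) for _ in range(n)]  # each: list of steps
    def chain_score(steps):
        return min(prm(prompt, steps[:i + 1]) for i in range(len(steps)))
    return max(candidates, key=chain_score)
```

Min-aggregation is one common choice because a single bad step usually invalidates a derivation; products or means of step scores are used as well.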
arXiv:2410.09347 [cs.CV, cs.LG, eess.IV]
Toward Guidance-Free AR Visual Generation via Condition Contrastive Alignment
Authors: Huayu Chen, Hang Su, Peize Sun, Jun Zhu
Abstract: Classifier-Free Guidance (CFG) is a critical technique for enhancing the sample quality of visual generative models. However, in autoregressive (AR) multi-modal generation, CFG introduces design inconsistencies between language and visual content, contradicting the design philosophy of unifying different modalities for visual AR. Motivated by language model alignment methods, we propose Condition Contrastive Alignment (CCA) to facilitate guidance-free AR visual generation with high performance and analyze its theoretical connection with guided sampling methods. Unlike guidance methods that alter the sampling process to achieve the ideal sampling distribution, CCA directly fine-tunes pretrained models to fit the same distribution target. Experimental results show that CCA can significantly enhance the guidance-free performance of all tested models with just one epoch of fine-tuning (about 1% of pretraining epochs) on the pretraining dataset, on par with guided sampling methods. This largely removes the need for guided sampling in AR visual generation and cuts the sampling cost by half. Moreover, by adjusting training parameters, CCA can achieve trade-offs between sample diversity and fidelity similar to CFG. This experimentally confirms the strong theoretical connection between language-targeted alignment and visual-targeted guidance methods, unifying two previously independent research fields. Code and model weights: https://github.com/thu-ml/CCA.
Submitted 11 October, 2024; originally announced October 2024.
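A hedged sketch of a contrastive-alignment objective in this spirit: raise the model's relative likelihood of an image's matched condition versus a mismatched one, measured against a frozen reference model, so guidance becomes unnecessary at sampling time. CCA's actual loss differs in form; this DPO-style surrogate only conveys the idea:

```python
# Contrastive alignment on (matched, mismatched) condition pairs: push the
# trainable model's log-likelihood margin (relative to a frozen reference)
# toward favoring the true condition.
import torch
import torch.nn.functional as F

def contrastive_align_loss(logp_pos, logp_neg, ref_pos, ref_neg, beta=0.1):
    # logp_*: sequence log-likelihoods under the trainable model;
    # ref_*:  the same likelihoods under the frozen pretrained reference.
    margin = beta * ((logp_pos - ref_pos) - (logp_neg - ref_neg))
    return -F.logsigmoid(margin).mean()

loss = contrastive_align_loss(torch.randn(4), torch.randn(4),
                              torch.randn(4), torch.randn(4))
```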
arXiv:2410.07864 [cs.RO, cs.AI, cs.CV, cs.LG]
RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation
Authors: Songming Liu, Lingxuan Wu, Bangguo Li, Hengkai Tan, Huayu Chen, Zhengyi Wang, Ke Xu, Hang Su, Jun Zhu
Abstract: Bimanual manipulation is essential in robotics, yet developing foundation models is extremely challenging due to the inherent complexity of coordinating two robot arms (leading to multi-modal action distributions) and the scarcity of training data. In this paper, we present the Robotics Diffusion Transformer (RDT), a pioneering diffusion foundation model for bimanual manipulation. RDT builds on diffusion models to effectively represent multi-modality, with innovative designs of a scalable Transformer to deal with the heterogeneity of multi-modal inputs and to capture the nonlinearity and high frequency of robotic data. To address data scarcity, we further introduce a Physically Interpretable Unified Action Space, which can unify the action representations of various robots while preserving the physical meanings of original actions, facilitating the learning of transferable physical knowledge.
With these designs, we managed to pre-train RDT on the largest collection of multi-robot datasets to date and scaled it up to 1.2B parameters, making it the largest diffusion-based foundation model for robotic manipulation. We finally fine-tuned RDT on a self-created multi-task bimanual dataset with over 6,000 episodes to refine its manipulation capabilities. Experiments on real robots demonstrate that RDT significantly outperforms existing methods. It exhibits zero-shot generalization to unseen objects and scenes, understands and follows language instructions, learns new skills with just 1 to 5 demonstrations, and effectively handles complex, dexterous tasks. We refer to https://rdt-robotics.github.io/rdt-robotics/ for the code and videos.
Submitted 10 October, 2024; originally announced October 2024.
Comments: 10 pages, conference
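A generic DDPM-style reverse loop for sampling an action chunk, the mechanism a diffusion policy such as RDT relies on; the denoiser, noise schedule, and chunk shape below are placeholders rather than RDT's configuration:

```python
# Start an action chunk from pure noise and iteratively denoise it,
# conditioning the (placeholder) denoiser on the current observation.
import torch

@torch.no_grad()
def sample_actions(denoiser, obs, horizon=16, act_dim=14, steps=50):
    betas = torch.linspace(1e-4, 2e-2, steps)
    alphas = 1.0 - betas
    abar = torch.cumprod(alphas, dim=0)
    x = torch.randn(horizon, act_dim)              # noise-initialized chunk
    for t in reversed(range(steps)):
        eps = denoiser(x, obs, t)                  # predicted noise
        x = (x - betas[t] / (1 - abar[t]).sqrt() * eps) / alphas[t].sqrt()
        if t > 0:
            x = x + betas[t].sqrt() * torch.randn_like(x)
    return x                                       # denoised action chunk
```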
arXiv:2410.07783 [cs.CV]
CLIP Multi-modal Hashing for Multimedia Retrieval
Authors: Jian Zhu, Mingkai Sheng, Zhangmin Huang, Jingfei Chang, Jinling Jiang, Jian Long, Cheng Luo, Lei Liu
Abstract: Multi-modal hashing methods are widely used in multimedia retrieval and can fuse multi-source data to generate binary hash codes. However, the individual backbone networks have limited feature expression capabilities and are not jointly pre-trained on large-scale unsupervised multi-modal data, resulting in low retrieval accuracy. To address this issue, we propose a novel CLIP Multi-modal Hashing (CLIPMH) method. Our method employs the CLIP framework to extract both text and vision features and then fuses them to generate hash codes. By enhancing each modal feature, our method greatly improves the retrieval performance of multi-modal hashing. Compared with state-of-the-art unsupervised and supervised multi-modal hashing methods, experiments reveal that the proposed CLIPMH can significantly improve performance (a maximum increase of 8.38% in mAP).
Submitted 10 October, 2024; originally announced October 2024.
Comments: Accepted by the 31st International Conference on MultiMedia Modeling (MMM 2025)
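A sketch of CLIP-based multi-modal hashing: encode image and text, fuse the two features, and binarize. The concatenation-plus-linear fusion here is an assumption; CLIPMH's fusion module may differ:

```python
# Fuse precomputed CLIP image and text features into a short binary code.
# sign() is non-differentiable, so training would use the tanh relaxation
# (or a straight-through estimator) and binarize only at retrieval time.
import torch

class ClipHash(torch.nn.Module):
    def __init__(self, feat_dim=512, bits=64):
        super().__init__()
        self.fuse = torch.nn.Linear(2 * feat_dim, bits)

    def forward(self, img_feat, txt_feat):
        h = torch.tanh(self.fuse(torch.cat([img_feat, txt_feat], dim=-1)))
        return torch.sign(h)   # binary hash code in {-1, +1}

codes = ClipHash()(torch.randn(4, 512), torch.randn(4, 512))
```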
href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 
0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>