Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 356 results for author: <span class="mathjax">Lin, B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Lin%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Lin, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lin%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lin, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lin%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lin%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18088">arXiv:2411.18088</a> <span> [<a href="https://arxiv.org/pdf/2411.18088">pdf</a>, <a href="https://arxiv.org/format/2411.18088">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> There are More Fish in the Sea: Automated Vulnerability Repair via Binary Templates </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bo Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shangwen Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liqian Chen</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xiaoguang Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18088v1-abstract-short" style="display: inline;"> As software vulnerabilities increase in both volume and complexity, vendors often struggle to repair them promptly. Automated vulnerability repair has emerged as a promising solution to reduce the burden of manual debugging and fixing activities. However, existing techniques exclusively focus on repairing the vulnerabilities at the source code level, which has various limitations. 
For example, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18088v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18088v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18088v1-abstract-full" style="display: none;"> As software vulnerabilities increase in both volume and complexity, vendors often struggle to repair them promptly. Automated vulnerability repair has emerged as a promising solution to reduce the burden of manual debugging and fixing activities. However, existing techniques exclusively focus on repairing the vulnerabilities at the source code level, which has various limitations. For example, they are not applicable to those (e.g., users or security analysts) who do not have access to the source code. Consequently, this restricts the practical application of these techniques, especially in cases where vendors are unable to provide timely patches. In this paper, we aim to address the above limitations by performing vulnerability repair at binary code level, and accordingly propose a template-based automated vulnerability repair approach for Java binaries. Built on top of the literature, we collect fix templates from both existing template-based automated program repair approaches and vulnerability-specific analyses, which are then implemented for the Java binaries. Our systematic application of these templates effectively mitigates vulnerabilities: experiments on the Vul4J dataset demonstrate that TemVUR successfully repairs 11 vulnerabilities, marking a notable 57.1% improvement over current repair techniques. Moreover, TemVUR securely fixes 66.7% more vulnerabilities compared to leading techniques (15 vs. 9), underscoring its effectiveness in mitigating the risks posed by these vulnerabilities. To assess the generalizability of TemVUR, we curate the ManyVuls4J dataset, which goes beyond Vul4J to encompass a wider diversity of vulnerabilities. With 30% more vulnerabilities than its predecessor (increasing from 79 to 103). The evaluation on ManyVuls4J reaffirms TemVUR's effectiveness and generalizability across a diverse set of real-world vulnerabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18088v1-abstract-full').style.display = 'none'; document.getElementById('2411.18088v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17459">arXiv:2411.17459</a> <span> [<a href="https://arxiv.org/pdf/2411.17459">pdf</a>, <a href="https://arxiv.org/format/2411.17459">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> WF-VAE: Enhancing Video VAE by Wavelet-Driven Energy Flow for Latent Video Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zongjian Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bin Lin</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yang Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liuhan Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinhua Cheng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shenghai Yuan</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Li Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17459v1-abstract-short" style="display: inline;"> Video Variational Autoencoder (VAE) encodes videos into a low-dimensional latent space, becoming a key component of most Latent Video Diffusion Models (LVDMs) to reduce model training costs. However, as the resolution and duration of generated videos increase, the encoding cost of Video VAEs becomes a limiting bottleneck in training LVDMs. Moreover, the block-wise inference method adopted by most… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17459v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17459v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17459v1-abstract-full" style="display: none;"> Video Variational Autoencoder (VAE) encodes videos into a low-dimensional latent space, becoming a key component of most Latent Video Diffusion Models (LVDMs) to reduce model training costs. However, as the resolution and duration of generated videos increase, the encoding cost of Video VAEs becomes a limiting bottleneck in training LVDMs. Moreover, the block-wise inference method adopted by most LVDMs can lead to discontinuities of latent space when processing long-duration videos. The key to addressing the computational bottleneck lies in decomposing videos into distinct components and efficiently encoding the critical information. Wavelet transform can decompose videos into multiple frequency-domain components and improve the efficiency significantly, we thus propose Wavelet Flow VAE (WF-VAE), an autoencoder that leverages multi-level wavelet transform to facilitate low-frequency energy flow into latent representation. Furthermore, we introduce a method called Causal Cache, which maintains the integrity of latent space during block-wise inference. 
Compared to state-of-the-art video VAEs, WF-VAE demonstrates superior performance in both PSNR and LPIPS metrics, achieving 2x higher throughput and 4x lower memory consumption while maintaining competitive reconstruction quality. Our code and models are available at https://github.com/PKU-YuanGroup/WF-VAE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17459v1-abstract-full').style.display = 'none'; document.getElementById('2411.17459v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17451">arXiv:2411.17451</a> <span> [<a href="https://arxiv.org/pdf/2411.17451">pdf</a>, <a href="https://arxiv.org/format/2411.17451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VLRewardBench: A Challenging Benchmark for Vision-Language Generative Reward Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yuancheng Wei</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zhihui Xie</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuqing Yang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yifan Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&query=An%2C+C">Chenxin An</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sujian Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B+Y">Bill Yuchen Lin</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17451v1-abstract-short" style="display: inline;"> Vision-language generative reward models (VL-GenRMs) play a crucial role in aligning and evaluating multimodal AI systems, yet their own evaluation remains under-explored. Current assessment methods primarily rely on AI-annotated preference labels from traditional VL tasks, which can introduce biases and often fail to effectively challenge state-of-the-art models. 
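   The wavelet-driven decomposition described above can be illustrated with a minimal sketch of a single-level Haar split, the kind of low/high-frequency separation such an encoder can apply along the axes of a video tensor. This is an illustration only, not the WF-VAE implementation; the 1-D `haar_split` helper is an assumed toy interface.

```python
# Illustrative only: a single-level Haar wavelet split of a 1-D signal into
# low- and high-frequency bands. A wavelet-driven video encoder applies this
# kind of decomposition (in higher dimensions and over multiple levels) so that
# the energy-rich low-frequency band can flow into the latent representation.
import numpy as np

def haar_split(x: np.ndarray):
    """Return (low, high) Haar bands of an even-length signal along the last axis."""
    assert x.shape[-1] % 2 == 0, "signal length must be even"
    pairs = x.reshape(*x.shape[:-1], -1, 2)
    low = (pairs[..., 0] + pairs[..., 1]) / np.sqrt(2.0)   # approximation (low-frequency) band
    high = (pairs[..., 0] - pairs[..., 1]) / np.sqrt(2.0)  # detail (high-frequency) band
    return low, high

# For a smooth signal, most of the energy lands in the low band.
signal = np.sin(np.linspace(0.0, 4.0 * np.pi, 64))
low, high = haar_split(signal)
print(np.sum(low**2) / np.sum(signal**2))  # close to 1.0
```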
3. arXiv:2411.17451 (https://arxiv.org/abs/2411.17451) [pdf, other] [cs.CV, cs.CL]
   VLRewardBench: A Challenging Benchmark for Vision-Language Generative Reward Models
   Authors: Lei Li, Yuancheng Wei, Zhihui Xie, Xuqing Yang, Yifan Song, Peiyi Wang, Chenxin An, Tianyu Liu, Sujian Li, Bill Yuchen Lin, Lingpeng Kong, Qi Liu
   Abstract: Vision-language generative reward models (VL-GenRMs) play a crucial role in aligning and evaluating multimodal AI systems, yet their own evaluation remains under-explored. Current assessment methods primarily rely on AI-annotated preference labels from traditional VL tasks, which can introduce biases and often fail to effectively challenge state-of-the-art models. To address these limitations, we introduce VL-RewardBench, a comprehensive benchmark spanning general multimodal queries, visual hallucination detection, and complex reasoning tasks. Through our AI-assisted annotation pipeline combining sample selection with human verification, we curate 1,250 high-quality examples specifically designed to probe model limitations. Comprehensive evaluation across 16 leading large vision-language models demonstrates VL-RewardBench's effectiveness as a challenging testbed, where even GPT-4o achieves only 65.4% accuracy, and state-of-the-art open-source models such as Qwen2-VL-72B struggle to surpass random guessing. Importantly, performance on VL-RewardBench strongly correlates (Pearson's r > 0.9) with MMMU-Pro accuracy using Best-of-N sampling with VL-GenRMs. Analysis experiments uncover three critical insights for improving VL-GenRMs: (i) models predominantly fail at basic visual perception tasks rather than reasoning tasks; (ii) inference-time scaling benefits vary dramatically by model capacity; and (iii) training VL-GenRMs to learn to judge substantially boosts judgment capability (+14.7% accuracy for a 7B VL-GenRM). We believe VL-RewardBench, along with the experimental insights, will become a valuable resource for advancing VL-GenRMs.
   Submitted 26 November, 2024; originally announced November 2024.
   Comments: Project page: https://vl-rewardbench.github.io

4. arXiv:2411.17323 (https://arxiv.org/abs/2411.17323) [pdf, other] [cs.CV]
   InsightEdit: Towards Better Instruction Following for Image Editing
   Authors: Yingjing Xu, Jie Kong, Jiazhi Wang, Xiao Pan, Bo Lin, Qiang Liu
   Abstract: In this paper, we focus on the task of instruction-based image editing. Previous works like InstructPix2Pix, InstructDiffusion, and SmartEdit have explored end-to-end editing. However, two limitations still remain: First, existing datasets suffer from low resolution, poor background consistency, and overly simplistic instructions. Second, current approaches mainly condition on the text while the rich image information is underexplored, and are therefore inferior in complex instruction following and in maintaining background consistency. Targeting these issues, we first curated the AdvancedEdit dataset using a novel data construction pipeline, formulating a large-scale dataset with high visual quality, complex instructions, and good background consistency. Then, to further inject the rich image information, we introduce a two-stream bridging mechanism that utilizes both the textual and visual features reasoned by powerful Multimodal Large Language Models (MLLMs) to guide the image editing process more precisely. Extensive results demonstrate that our approach, InsightEdit, achieves state-of-the-art performance, excelling in complex instruction following and maintaining high background consistency with the original image.
   Submitted 26 November, 2024; originally announced November 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17323v1-abstract-full').style.display = 'none'; document.getElementById('2411.17323v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17230">arXiv:2411.17230</a> <span> [<a href="https://arxiv.org/pdf/2411.17230">pdf</a>, <a href="https://arxiv.org/format/2411.17230">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Fault Localization from the Semantic Code Search Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yihao Qin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shangwen Wang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Y">Yan Lei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bo Lin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xin Peng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liqian Chen</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xiaoguang Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17230v1-abstract-short" style="display: inline;"> The software development process is characterized by an iterative cycle of continuous functionality implementation and debugging, essential for the enhancement of software quality and adaptability to changing requirements. This process incorporates two isolatedly studied tasks: Code Search (CS), which retrieves reference code from a code corpus to aid in code implementation, and Fault Localization… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17230v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17230v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17230v1-abstract-full" style="display: none;"> The software development process is characterized by an iterative cycle of continuous functionality implementation and debugging, essential for the enhancement of software quality and adaptability to changing requirements. This process incorporates two isolatedly studied tasks: Code Search (CS), which retrieves reference code from a code corpus to aid in code implementation, and Fault Localization (FL), which identifies code entities responsible for bugs within the software project to boost software debugging. These two tasks exhibit similarities since they both address search problems. Notably, CS techniques have demonstrated greater effectiveness than FL ones, possibly because of the precise semantic details of the required code offered by natural language queries, which are not readily accessible to FL methods. 
Drawing inspiration from this, we hypothesize that a fault localizer could achieve greater proficiency if semantic information about the buggy methods were made available. Based on this idea, we propose CosFL, an FL approach that decomposes the FL task into two steps: query generation, which describes the functionality of the problematic code in natural language, and fault retrieval, which uses CS to find program elements semantically related to the query. Specifically, to depict the buggy functionalities and generate high-quality queries, CosFL extensively harnesses the code analysis, semantic comprehension, and decision-making capabilities of LLMs. Moreover, to enhance the accuracy of CS, CosFL captures varying levels of context information and employs a multi-granularity code search strategy, which facilitates a more precise identification of buggy methods from a holistic view. The evaluation on 835 real bugs from 23 Java projects shows that CosFL successfully localizes 324 bugs within Top-1, which significantly outperforms the state-of-the-art approaches by 26.6%-57.3%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17230v1-abstract-full').style.display = 'none'; document.getElementById('2411.17230v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07391">arXiv:2411.07391</a> <span> [<a href="https://arxiv.org/pdf/2411.07391">pdf</a>, <a href="https://arxiv.org/format/2411.07391">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Federated Learning Client Pruning for Noisy Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Morafah%2C+M">Mahdi Morafah</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Hojin Chang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bill Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07391v1-abstract-short" style="display: inline;"> Federated Learning (FL) enables collaborative model training across decentralized edge devices while preserving data privacy. However, existing FL methods often assume clean annotated datasets, impractical for resource-constrained edge devices. In reality, noisy labels are prevalent, posing significant challenges to FL performance. 
Prior approaches attempt label correction and robust training tech… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07391v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07391v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07391v1-abstract-full" style="display: none;"> Federated Learning (FL) enables collaborative model training across decentralized edge devices while preserving data privacy. However, existing FL methods often assume clean annotated datasets, impractical for resource-constrained edge devices. In reality, noisy labels are prevalent, posing significant challenges to FL performance. Prior approaches attempt label correction and robust training techniques but exhibit limited efficacy, particularly under high noise levels. This paper introduces ClipFL (Federated Learning Client Pruning), a novel framework addressing noisy labels from a fresh perspective. ClipFL identifies and excludes noisy clients based on their performance on a clean validation dataset, tracked using a Noise Candidacy Score (NCS). The framework comprises three phases: pre-client pruning to identify potential noisy clients and calculate their NCS, client pruning to exclude a percentage of clients with the highest NCS, and post-client pruning for fine-tuning the global model with standard FL on clean clients. Empirical evaluation demonstrates ClipFL's efficacy across diverse datasets and noise levels, achieving accurate noisy client identification, superior performance, faster convergence, and reduced communication costs compared to state-of-the-art FL methods. Our code is available at https://github.com/MMorafah/ClipFL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07391v1-abstract-full').style.display = 'none'; document.getElementById('2411.07391v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
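   The three phases described in the abstract above can be sketched in a few lines of Python. This is a minimal sketch of the stated workflow, not the authors' code; `evaluate_on_clean_val` and `federated_round` are assumed placeholder callables for clean-validation scoring and one standard FL round.

```python
# Minimal sketch of the ClipFL workflow as described in the abstract (not the
# authors' implementation). evaluate_on_clean_val(client) is assumed to return
# an accuracy on a clean validation set; federated_round(clients, model) is
# assumed to run one standard federated averaging round and return the model.

def clip_fl(clients, evaluate_on_clean_val, federated_round,
            pre_rounds=5, prune_fraction=0.2, final_rounds=50):
    # Phase 1: pre-client pruning -- accumulate a Noise Candidacy Score (NCS)
    # per client; poor clean-validation performance raises the score.
    ncs = {c: 0.0 for c in clients}
    for _ in range(pre_rounds):
        for c in clients:
            ncs[c] += 1.0 - evaluate_on_clean_val(c)

    # Phase 2: client pruning -- exclude the fraction of clients with the highest NCS.
    ranked = sorted(clients, key=lambda c: ncs[c])
    kept = ranked[: int(round(len(clients) * (1.0 - prune_fraction)))]

    # Phase 3: post-client pruning -- fine-tune the global model with standard FL
    # on the retained (presumed clean) clients.
    model = None
    for _ in range(final_rounds):
        model = federated_round(kept, model)
    return model
```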
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ACM Transactions on Modeling and Performance Evaluation of Computing Systems, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07133">arXiv:2411.07133</a> <span> [<a href="https://arxiv.org/pdf/2411.07133">pdf</a>, <a href="https://arxiv.org/format/2411.07133">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Stronger Models are NOT Stronger Teachers for Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhangchen Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+F">Fengqing Jiang</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+L">Luyao Niu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B+Y">Bill Yuchen Lin</a>, <a href="/search/cs?searchtype=author&query=Poovendran%2C+R">Radha Poovendran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07133v2-abstract-short" style="display: inline;"> Instruction tuning has been widely adopted to ensure large language models (LLMs) follow user instructions effectively. The resulting instruction-following capabilities of LLMs heavily rely on the instruction datasets used for tuning. Recently, synthetic instruction datasets have emerged as an economically viable solution to provide LLMs diverse and high-quality instructions. However, existing app… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07133v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07133v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07133v2-abstract-full" style="display: none;"> Instruction tuning has been widely adopted to ensure large language models (LLMs) follow user instructions effectively. The resulting instruction-following capabilities of LLMs heavily rely on the instruction datasets used for tuning. Recently, synthetic instruction datasets have emerged as an economically viable solution to provide LLMs diverse and high-quality instructions. However, existing approaches typically assume that larger or stronger models are stronger teachers for instruction tuning, and hence simply adopt these models as response generators to the synthetic instructions. In this paper, we challenge this commonly-adopted assumption. Our extensive experiments across five base models and twenty response generators reveal that larger and stronger models are not necessarily stronger teachers of smaller models. We refer to this phenomenon as the Larger Models' Paradox. We observe that existing metrics cannot precisely predict the effectiveness of response generators since they ignore the compatibility between teachers and base models being fine-tuned. We thus develop a novel metric, named as Compatibility-Adjusted Reward (CAR) to measure the effectiveness of response generators. 
Our experiments across five base models demonstrate that CAR outperforms almost all baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07133v2-abstract-full').style.display = 'none'; document.getElementById('2411.07133v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06175">arXiv:2411.06175</a> <span> [<a href="https://arxiv.org/pdf/2411.06175">pdf</a>, <a href="https://arxiv.org/format/2411.06175">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Clustering Algorithms and RAG Enhancing Semi-Supervised Text Classification with Large LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shan Zhong</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jiahao Zeng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yongxin Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bohong Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06175v1-abstract-short" style="display: inline;"> This paper introduces an innovative semi-supervised learning approach for text classification, addressing the challenge of abundant data but limited labeled examples. Our methodology integrates few-shot learning with retrieval-augmented generation (RAG) and conventional statistical clustering, enabling effective learning from a minimal number of labeled instances while generating high-quality labe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06175v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06175v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06175v1-abstract-full" style="display: none;"> This paper introduces an innovative semi-supervised learning approach for text classification, addressing the challenge of abundant data but limited labeled examples. Our methodology integrates few-shot learning with retrieval-augmented generation (RAG) and conventional statistical clustering, enabling effective learning from a minimal number of labeled instances while generating high-quality labeled data. To the best of our knowledge, we are the first to incorporate RAG alongside clustering in text data generation. Our experiments on the Reuters and Web of Science datasets demonstrate state-of-the-art performance, with few-shot augmented data alone producing results nearly equivalent to those achieved with fully labeled datasets. Notably, accuracies of 95.41\% and 82.43\% were achieved for complex text document classification tasks, where the number of categories can exceed 100. 
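   As a rough illustration of the clustering side of such a pipeline (the few-shot RAG/LLM generation step is deliberately omitted), the sketch below uses TF-IDF features and a nearest-labeled-example rule; both are illustrative assumptions, not the paper's method.

```python
# Illustrative sketch: pseudo-label unlabeled texts by clustering them and
# assigning each cluster the label of the labeled example nearest its centroid.
# This stands in for the "conventional statistical clustering" component only;
# the paper's few-shot RAG generation step is not shown.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def pseudo_label(unlabeled_texts, labeled_texts, labels, n_clusters=10):
    vectorizer = TfidfVectorizer().fit(unlabeled_texts + labeled_texts)
    X_unlabeled = vectorizer.transform(unlabeled_texts)
    X_labeled = vectorizer.transform(labeled_texts)
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X_unlabeled)
    # Map each cluster to the label of the labeled example closest to its centroid
    # (dot products on L2-normalized tf-idf vectors rank by cosine similarity).
    cluster_to_label = [
        labels[int(np.argmax(X_labeled @ center))] for center in km.cluster_centers_
    ]
    return [cluster_to_label[c] for c in km.labels_]
```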
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06175v1-abstract-full').style.display = 'none'; document.getElementById('2411.06175v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02461">arXiv:2411.02461</a> <span> [<a href="https://arxiv.org/pdf/2411.02461">pdf</a>, <a href="https://arxiv.org/format/2411.02461">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Multiple Dimensions of Trustworthiness in LLMs via Sparse Activation Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yuxin Xiao</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Chaoqun Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yonggang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xu Shen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02461v1-abstract-short" style="display: inline;"> As the development and application of Large Language Models (LLMs) continue to advance rapidly, enhancing their trustworthiness and aligning them with human preferences has become a critical area of research. Traditional methods rely heavily on extensive data for Reinforcement Learning from Human Feedback (RLHF), but representation engineering offers a new, training-free approach. This technique l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02461v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02461v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02461v1-abstract-full" style="display: none;"> As the development and application of Large Language Models (LLMs) continue to advance rapidly, enhancing their trustworthiness and aligning them with human preferences has become a critical area of research. Traditional methods rely heavily on extensive data for Reinforcement Learning from Human Feedback (RLHF), but representation engineering offers a new, training-free approach. This technique leverages semantic features to control the representation of LLM's intermediate hidden states, enabling the model to meet specific requirements such as increased honesty or heightened safety awareness. However, a significant challenge arises when attempting to fulfill multiple requirements simultaneously. 
It proves difficult to encode various semantic contents, like honesty and safety, into a singular semantic feature, restricting its practicality. In this work, we address this issue through ``Sparse Activation Control''. By delving into the intrinsic mechanisms of LLMs, we manage to identify and pinpoint components that are closely related to specific tasks within the model, i.e., attention heads. These heads display sparse characteristics that allow for near-independent control over different tasks. Our experiments, conducted on the open-source Llama series models, have yielded encouraging results. The models were able to align with human preferences on issues of safety, factuality, and bias concurrently. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02461v1-abstract-full').style.display = 'none'; document.getElementById('2411.02461v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23166">arXiv:2410.23166</a> <span> [<a href="https://arxiv.org/pdf/2410.23166">pdf</a>, <a href="https://arxiv.org/format/2410.23166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SciPIP: An LLM-based Scientific Paper Idea Proposer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+L">Lihui Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liye Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yunxiang Luo</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yi Dai</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chen Shen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Liang Xie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23166v1-abstract-short" style="display: inline;"> The exponential growth of knowledge and the increasing complexity of interdisciplinary research pose significant challenges for researchers, including information overload and difficulties in exploring novel ideas. 
The advancements in large language models (LLMs), such as GPT-4, have shown great potential in enhancing idea proposals, but how to effectively utilize large models for reasonable idea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23166v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23166v1-abstract-full" style="display: none;"> The exponential growth of knowledge and the increasing complexity of interdisciplinary research pose significant challenges for researchers, including information overload and difficulties in exploring novel ideas. The advancements in large language models (LLMs), such as GPT-4, have shown great potential in enhancing idea proposals, but how to effectively utilize large models for reasonable idea proposal has not been thoroughly explored. This paper proposes a scientific paper idea proposer (SciPIP). Based on a user-provided research background, SciPIP retrieves helpful papers from a literature database while leveraging the capabilities of LLMs to generate more novel and feasible ideas. To this end, 1) we construct a literature retrieval database, extracting lots of papers' multi-dimension information for fast access. Then, a literature retrieval method based on semantics, entity, and citation co-occurrences is proposed to search relevant literature from multiple aspects based on the user-provided background. 2) After literature retrieval, we introduce dual-path idea proposal strategies, where one path infers solutions from the retrieved literature and the other path generates original ideas through model brainstorming. We then combine the two to achieve a good balance between feasibility and originality. Through extensive experiments on the natural language processing (NLP) field, we demonstrate that SciPIP can retrieve citations similar to those of existing top conference papers and generate many ideas consistent with them. Additionally, we evaluate the originality of other ideas generated by SciPIP using large language models, further validating the effectiveness of our proposed method. The code and the database are released at https://github.com/cheerss/SciPIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23166v1-abstract-full').style.display = 'none'; document.getElementById('2410.23166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 5 figures, 19 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23123">arXiv:2410.23123</a> <span> [<a href="https://arxiv.org/pdf/2410.23123">pdf</a>, <a href="https://arxiv.org/format/2410.23123">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> On Memorization of Large Language Models in Logical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chulin Xie</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Da Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyun Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B+Y">Bill Yuchen Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Ghazi%2C+B">Badih Ghazi</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+R">Ravi Kumar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23123v1-abstract-short" style="display: inline;"> Large language models (LLMs) achieve good performance on challenging reasoning benchmarks, yet could also make basic reasoning mistakes. This contrasting behavior is puzzling when it comes to understanding the mechanisms behind LLMs' reasoning capabilities. One hypothesis is that the increasingly high and nearly saturated performance on common reasoning benchmarks could be due to the memorization… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23123v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23123v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23123v1-abstract-full" style="display: none;"> Large language models (LLMs) achieve good performance on challenging reasoning benchmarks, yet could also make basic reasoning mistakes. This contrasting behavior is puzzling when it comes to understanding the mechanisms behind LLMs' reasoning capabilities. One hypothesis is that the increasingly high and nearly saturated performance on common reasoning benchmarks could be due to the memorization of similar problems. In this paper, we systematically investigate this hypothesis with a quantitative measurement of memorization in reasoning tasks, using a dynamically generated logical reasoning benchmark based on Knights and Knaves (K&K) puzzles. We found that LLMs could interpolate the training puzzles (achieving near-perfect accuracy) after fine-tuning, yet fail when those puzzles are slightly perturbed, suggesting that the models heavily rely on memorization to solve those training puzzles. On the other hand, we show that while fine-tuning leads to heavy memorization, it also consistently improves generalization performance. 

arXiv:2410.18808 (https://arxiv.org/abs/2410.18808) [pdf, other] cs.CL
Delving into the Reversal Curse: How Far Can Large Language Models Generalize?
Authors: Zhengkai Lin, Zhihang Fu, Kai Liu, Liang Xie, Binbin Lin, Wenxiao Wang, Deng Cai, Yue Wu, Jieping Ye
Abstract: While large language models (LLMs) showcase unprecedented capabilities, they also exhibit certain inherent limitations when facing seemingly trivial tasks. A prime example is the recently debated "reversal curse", which surfaces when models, having been trained on the fact "A is B", struggle to generalize this knowledge to infer that "B is A". In this paper, we examine the manifestation of the reversal curse across various tasks and delve into both the generalization abilities and the problem-solving mechanisms of LLMs. This investigation leads to a series of significant insights: (1) LLMs are able to generalize to "B is A" when both A and B are presented in the context, as in the case of a multiple-choice question. (2) This generalization ability is highly correlated with the structure of the fact "A is B" in the training documents. For example, the generalization only applies to biographies structured as "[Name] is [Description]" but not to "[Description] is [Name]". (3) We propose and verify the hypothesis that LLMs possess an inherent bias in fact recall during knowledge application, which explains and underscores the importance of document structure for successful learning. (4) The negative impact of this bias on the downstream performance of LLMs can hardly be mitigated through training alone. These findings offer a novel perspective on interpreting LLMs' generalization through their intrinsic mechanisms and provide insights for developing more effective learning methods. Our code and data are available at https://github.com/alibaba/thinking_bias.git.
Submitted 22 November, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: Accepted at NeurIPS 2024. Our code and data are available at https://github.com/alibaba/thinking_bias.git

arXiv:2410.14731 (https://arxiv.org/abs/2410.14731) [pdf, other] cs.LG cs.AI cs.CL
MatryoshkaKV: Adaptive KV Compression via Trainable Orthogonal Projection
Authors: Bokai Lin, Zihao Zeng, Zipeng Xiao, Siqi Kou, Tianqi Hou, Xiaofeng Gao, Hao Zhang, Zhijie Deng
Abstract: The KV cache has become a de facto technique for the inference of large language models (LLMs), where tensors of shape (layer number, head number, sequence length, feature dimension) are introduced to cache historical information for self-attention. As the size of the model and data grows, the KV cache can quickly become a bottleneck within the system in both storage and memory transfer. To address this, prior studies usually focus on the first three axes of the cache tensors for compression. This paper supplements them, focusing on the feature dimension axis, by utilizing low-rank projection matrices to transform the cache features into spaces with reduced dimensions. We begin by investigating the canonical orthogonal projection method for data compression through principal component analysis (PCA), and observe that PCA projection suffers significant performance degradation at low compression rates. To bridge the gap, we propose to directly tune the orthogonal projection matrices with a distillation objective using an elaborate Matryoshka training strategy. After training, we adaptively search for the optimal compression rates for various layers and heads given varying compression budgets. Compared to previous works, our method can easily embrace pre-trained LLMs and hold a smooth tradeoff between performance and compression rate. We empirically witness the high data efficiency of our training procedure and find that our method can sustain over 90% performance with an average KV cache compression rate of 60% (and up to 75% in certain extreme scenarios) for popular LLMs such as LLaMA2-7B-base and Mistral-7B-v0.3-base.
Submitted 16 October, 2024; originally announced October 2024.
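
A minimal sketch of the PCA-style orthogonal projection baseline this abstract starts from, compressing only the feature-dimension axis of cached K/V states. It is not the authors' code (which further tunes the projections with a distillation objective); the shapes, rank, and function names are assumptions for illustration.

```python
# Hypothetical sketch (not the MatryoshkaKV code): PCA-based compression of the
# feature axis of a KV cache via an orthonormal projection.
import numpy as np


def fit_orthogonal_projection(kv_samples, rank):
    """kv_samples: (num_tokens, head_dim) calibration features for one head."""
    mean = kv_samples.mean(axis=0, keepdims=True)
    # PCA via SVD of the centered samples; rows of Vt are principal directions.
    _, _, vt = np.linalg.svd(kv_samples - mean, full_matrices=False)
    proj = vt[:rank]                      # (rank, head_dim), orthonormal rows
    return mean, proj


def compress(kv, mean, proj):
    """Project cached K or V states (num_tokens, head_dim) down to (num_tokens, rank)."""
    return (kv - mean) @ proj.T


def decompress(kv_low, mean, proj):
    """Approximate reconstruction back to the full head dimension."""
    return kv_low @ proj + mean


# toy usage: 75% compression of a 128-dim head
rng = np.random.default_rng(0)
calib = rng.normal(size=(4096, 128))
keys = rng.normal(size=(256, 128))
mean, proj = fit_orthogonal_projection(calib, rank=32)
keys_hat = decompress(compress(keys, mean, proj), mean, proj)
print("reconstruction error:", float(np.mean((keys - keys_hat) ** 2)))
```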

arXiv:2410.14283 (https://arxiv.org/abs/2410.14283) [pdf, other] cs.CV
Takin-ADA: Emotion Controllable Audio-Driven Animation with Canonical and Landmark Loss Optimization
Authors: Bin Lin, Yanzhen Yu, Jianhao Ye, Ruitao Lv, Yuguang Yang, Ruoye Xie, Pan Yu, Hongbin Zhou
Abstract: Existing audio-driven facial animation methods face critical challenges, including expression leakage, ineffective subtle expression transfer, and imprecise audio-driven synchronization. We discovered that these issues stem from limitations in motion representation and the lack of fine-grained control over facial expressions. To address these problems, we present Takin-ADA, a novel two-stage approach for real-time audio-driven portrait animation. In the first stage, we introduce a specialized loss function that enhances subtle expression transfer while reducing unwanted expression leakage. The second stage utilizes an advanced audio processing technique to improve lip-sync accuracy. Our method not only generates precise lip movements but also allows flexible control over facial expressions and head motions. Takin-ADA achieves high-resolution (512x512) facial animations at up to 42 FPS on an RTX 4090 GPU, outperforming existing commercial solutions. Extensive experiments demonstrate that our model significantly surpasses previous methods in video quality, facial dynamics realism, and natural head movements, setting a new benchmark in the field of audio-driven facial animation.
Submitted 18 October, 2024; originally announced October 2024.
Comments: under review

arXiv:2410.12876 (https://arxiv.org/abs/2410.12876) [pdf, other] cs.CL cs.LG
In-context KV-Cache Eviction for LLMs via Attention-Gate
Authors: Zihao Zeng, Bokai Lin, Tianqi Hou, Hao Zhang, Zhijie Deng
Abstract: The KV-Cache technique has become the standard for the inference of large language models (LLMs). It caches states of self-attention to avoid recomputation. Yet, it is widely criticized that the KV-Cache can become a bottleneck of the LLM inference system, especially when confronted with ultra-large models and long-context queries. A natural remedy is to discard the KV-Cache for less important tokens, with StreamingLLM as an example, but its static eviction strategy cannot flexibly adapt to varying contexts. Remedies like H2O leverage accumulative attention scores to perform dynamic eviction but suffer from an attention bias issue in capturing contextual information. This paper bridges this gap by devising a parameterized KV-Cache eviction mechanism, dubbed Attention-Gate, which accepts the whole context as input and yields eviction flags for each token to realize in-context eviction. The subsequent self-attention module proceeds according to the flags, and only the KV states of the remaining tokens need to be cached. Attention-Gates can vary among different heads and layers and can be trivially plugged into pre-trained LLMs, tuned with cost-effective continual pre-training or supervised fine-tuning objectives to learn what to discard. The computational and memory overhead introduced by Attention-Gates is minimal. Our method is validated across multiple tasks, demonstrating both efficiency and adaptability. After a highly efficient continual pre-training, it achieves higher average accuracy and evicts more tokens compared to traditional training-free methods. In supervised fine-tuning, it not only evicts many tokens but also outperforms LoRA-finetuned LLMs on some datasets, such as RTE, where it improves accuracy by 13.9% while evicting 62.8% of tokens, showing that effective eviction of redundant tokens can even enhance performance.
Submitted 19 October, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
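
A minimal sketch of the in-context eviction idea in this abstract: a small gate looks at the whole context, emits per-token keep/evict flags, and only the kept tokens' K/V states are retained. This is not the paper's implementation; the gate architecture, threshold, and shapes are illustrative assumptions.

```python
# Hypothetical sketch (not the paper's code): a per-token "attention gate" that
# scores hidden states and drops the K/V entries of tokens flagged for eviction.
import torch
import torch.nn as nn


class AttentionGate(nn.Module):
    def __init__(self, hidden_dim: int):
        super().__init__()
        self.scorer = nn.Linear(hidden_dim, 1)   # tiny head on top of hidden states

    def forward(self, hidden: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
        """hidden: (batch, seq_len, hidden_dim) -> boolean keep-flags (batch, seq_len)."""
        keep_prob = torch.sigmoid(self.scorer(hidden)).squeeze(-1)
        return keep_prob > threshold


def evict_kv_cache(keys, values, keep_flags):
    """Drop K/V entries whose flag is False (single-sequence case for clarity)."""
    idx = keep_flags[0].nonzero(as_tuple=True)[0]       # kept token positions
    return keys[:, idx], values[:, idx]


# toy usage
hidden = torch.randn(1, 10, 64)
keys = torch.randn(1, 10, 64)
values = torch.randn(1, 10, 64)
gate = AttentionGate(hidden_dim=64)
flags = gate(hidden)
k_kept, v_kept = evict_kv_cache(keys, values, flags)
print("kept", k_kept.shape[1], "of", keys.shape[1], "tokens")
```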

arXiv:2410.11758 (https://arxiv.org/abs/2410.11758) [pdf, other] cs.RO cs.CL cs.CV cs.LG
Latent Action Pretraining from Videos
Authors: Seonghyeon Ye, Joel Jang, Byeongguk Jeon, Sejune Joo, Jianwei Yang, Baolin Peng, Ajay Mandlekar, Reuben Tan, Yu-Wei Chao, Bill Yuchen Lin, Lars Liden, Kimin Lee, Jianfeng Gao, Luke Zettlemoyer, Dieter Fox, Minjoon Seo
Abstract: We introduce Latent Action Pretraining for general Action models (LAPA), an unsupervised method for pretraining Vision-Language-Action (VLA) models without ground-truth robot action labels. Existing Vision-Language-Action models require action labels typically collected by human teleoperators during pretraining, which significantly limits possible data sources and scale. In this work, we propose a method to learn from internet-scale videos that do not have robot action labels. We first train an action quantization model leveraging a VQ-VAE-based objective to learn discrete latent actions between image frames, then pretrain a latent VLA model to predict these latent actions from observations and task descriptions, and finally finetune the VLA on small-scale robot manipulation data to map from latent to robot actions. Experimental results demonstrate that our method significantly outperforms existing techniques that train robot manipulation policies from large-scale videos. Furthermore, it outperforms the state-of-the-art VLA model trained with robotic action labels on real-world manipulation tasks that require language conditioning, generalization to unseen objects, and semantic generalization to unseen instructions. Training only on human manipulation videos also shows positive transfer, opening up the potential of leveraging web-scale data for robotics foundation models.
Submitted 15 October, 2024; originally announced October 2024.
Comments: Website: https://latentactionpretraining.github.io
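
A minimal sketch of the VQ-VAE-style action quantization step this abstract describes, turning the transition between two frames into a discrete latent action via a nearest-codebook lookup. It is not the LAPA code; the encoder, codebook size, and feature dimensions are assumptions for illustration.

```python
# Hypothetical sketch (not the LAPA implementation): quantize the change between
# two frame embeddings into a discrete "latent action" token.
import torch
import torch.nn as nn


class LatentActionQuantizer(nn.Module):
    def __init__(self, frame_dim: int, code_dim: int = 32, codebook_size: int = 64):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(2 * frame_dim, 128), nn.ReLU(), nn.Linear(128, code_dim)
        )
        self.codebook = nn.Embedding(codebook_size, code_dim)

    def forward(self, frame_t: torch.Tensor, frame_t1: torch.Tensor):
        """frames: (batch, frame_dim) features -> (discrete action ids, quantized codes)."""
        z = self.encoder(torch.cat([frame_t, frame_t1], dim=-1))
        # nearest-codebook lookup (the vector-quantization step)
        dists = torch.cdist(z, self.codebook.weight)          # (batch, codebook_size)
        ids = dists.argmin(dim=-1)                            # discrete latent actions
        return ids, self.codebook(ids)


# toy usage: quantize the transition between two random frame embeddings
quantizer = LatentActionQuantizer(frame_dim=256)
f0, f1 = torch.randn(4, 256), torch.randn(4, 256)
action_ids, codes = quantizer(f0, f1)
print(action_ids.tolist(), codes.shape)
```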

arXiv:2410.10815 (https://arxiv.org/abs/2410.10815) [pdf, other] cs.CV cs.AI
Depth Any Video with Scalable Synthetic Data
Authors: Honghui Yang, Di Huang, Wei Yin, Chunhua Shen, Haifeng Liu, Xiaofei He, Binbin Lin, Wanli Ouyang, Tong He
Abstract: Video depth estimation has long been hindered by the scarcity of consistent and scalable ground truth data, leading to inconsistent and unreliable results. In this paper, we introduce Depth Any Video, a model that tackles the challenge through two key innovations. First, we develop a scalable synthetic data pipeline, capturing real-time video depth data from diverse synthetic environments and yielding 40,000 video clips of 5-second duration, each with precise depth annotations. Second, we leverage the powerful priors of generative video diffusion models to handle real-world videos effectively, integrating advanced techniques such as rotary position encoding and flow matching to further enhance flexibility and efficiency. Unlike previous models, which are limited to fixed-length video sequences, our approach introduces a novel mixed-duration training strategy that handles videos of varying lengths and performs robustly across different frame rates, even on single frames. At inference, we propose a depth interpolation method that enables our model to infer high-resolution video depth across sequences of up to 150 frames. Our model outperforms all previous generative depth models in terms of spatial accuracy and temporal consistency.
Submitted 14 October, 2024; originally announced October 2024.
Comments: Project Page: https://depthanyvideo.github.io/

arXiv:2410.10394 (https://arxiv.org/abs/2410.10394) [pdf, other] cs.RO cs.AI cs.CV cs.LG
PIVOT-R: Primitive-Driven Waypoint-Aware World Model for Robotic Manipulation
Authors: Kaidong Zhang, Pengzhen Ren, Bingqian Lin, Junfan Lin, Shikui Ma, Hang Xu, Xiaodan Liang
Abstract: Language-guided robotic manipulation is a challenging task that requires an embodied agent to follow abstract user instructions to accomplish various complex manipulation tasks.
Previous works trivially fit the data without revealing the relation between instructions and low-level executable actions; as a result, these models are prone to memorizing the superficial patterns of the data instead of acquiring transferable knowledge, and are thus fragile to dynamic environment changes. To address this issue, we propose a PrImitive-driVen waypOinT-aware world model for Robotic manipulation (PIVOT-R) that focuses solely on the prediction of task-relevant waypoints. Specifically, PIVOT-R consists of a Waypoint-aware World Model (WAWM) and a lightweight action prediction module. The former performs primitive action parsing and primitive-driven waypoint prediction, while the latter focuses on decoding low-level actions. Additionally, we design an asynchronous hierarchical executor (AHE), which can use different execution frequencies for different modules of the model, thereby reducing computational redundancy and improving execution efficiency. Our PIVOT-R outperforms state-of-the-art (SoTA) open-source models on the SeaWave benchmark, achieving an average relative improvement of 19.45% across four levels of instruction tasks. Moreover, compared to the synchronously executed PIVOT-R, the execution efficiency of PIVOT-R with AHE is increased by 28-fold, with only a 2.9% drop in performance. These results provide compelling evidence that PIVOT-R can significantly improve both the performance and efficiency of robotic manipulation.
Submitted 16 October, 2024; v1 submitted 14 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024

arXiv:2410.09772 (https://arxiv.org/abs/2410.09772) [pdf, other] cs.HC cs.AI
HypomimiaCoach: An AU-based Digital Therapy System for Hypomimia Detection & Rehabilitation with Parkinson's Disease
Authors: Yingjing Xu, Xueyan Cai, Zihong Zhou, Mengru Xue, Bo Wang, Haotian Wang, Zhengke Li, Chentian Weng, Wei Luo, Cheng Yao, Bo Lin, Jianwei Yin
Abstract: Hypomimia is a non-motor symptom of Parkinson's disease that manifests as delayed facial movements and expressions, along with challenges in articulation and emotion. Currently, subjective evaluation by neurologists is the primary method for hypomimia detection, and conventional rehabilitation approaches heavily rely on verbal prompts from rehabilitation physicians. There remains a deficiency in accessible, user-friendly, and scientifically rigorous assistive tools for hypomimia treatment. To investigate this, we developed HypomimiaCoach, an Action Unit (AU)-based digital therapy system for hypomimia detection and rehabilitation in Parkinson's disease.
The HypomimiaCoach system was designed to facilitate engagement through the incorporation of both relaxed and controlled rehabilitation exercises, while also stimulating initiative through the integration of digital therapies that incorporate traditional face-training methods. We extract Action Unit (AU) features and their relationships for hypomimia detection. To facilitate rehabilitation, a series of training programmes has been devised based on the AUs, and patients are provided with real-time feedback through an additional AU recognition model, which guides them through their training routines. A pilot study was conducted with seven participants in China, all of whom exhibited symptoms of Parkinson's disease hypomimia. The results of the pilot study demonstrated a positive impact on participants' self-efficacy, with favourable feedback received. Furthermore, physician evaluations validated the system's applicability in a therapeutic setting for patients with Parkinson's disease, as well as its potential value in clinical applications.
Submitted 13 October, 2024; originally announced October 2024.

arXiv:2410.06671 (https://arxiv.org/abs/2410.06671) [pdf, other] cs.LG
GLA-DA: Global-Local Alignment Domain Adaptation for Multivariate Time Series
Authors: Gang Tu, Dan Li, Bingxin Lin, Zibin Zheng, See-Kiong Ng
Abstract: Unlike images and natural language tokens, time series data is highly semantically sparse, resulting in labor-intensive label annotations. Unsupervised and Semi-supervised Domain Adaptation (UDA and SSDA) have demonstrated efficiency in addressing this issue by utilizing pre-labeled source data to train on unlabeled or partially labeled target data.
However, in domain adaptation methods designed for downstream classification tasks, directly adapting labeled source samples with unlabeled target samples often results in similar distributions across the various classes, thereby compromising the performance of the target classification task. To tackle this challenge, we propose a Global-Local Alignment Domain Adaptation (GLA-DA) method for multivariate time series data. Data from the two domains are first encoded and aligned adversarially in an intermediate feature space, achieving Global Feature Alignment (GFA). Subsequently, GLA-DA leverages the consistency between similarity-based and deep learning-based models to assign pseudo labels to unlabeled target data. This process aims to preserve differences among data with distinct labels by aligning samples with the same class labels together, achieving Local Class Alignment (LCA). We implement GLA-DA in both UDA and SSDA scenarios, showcasing its superiority over state-of-the-art methods through extensive experiments on various public datasets. Ablation experiments underscore the significance of key components within GLA-DA.
Submitted 9 October, 2024; originally announced October 2024.
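
A minimal sketch of the pseudo-labeling step this abstract describes for Local Class Alignment: keep only target samples where a similarity-based model (nearest source-class centroid) and a deep classifier agree. It is not the GLA-DA implementation; the feature shapes and helper names are invented for the example.

```python
# Hypothetical sketch (not the GLA-DA code): consensus pseudo-labeling of
# unlabeled target samples from two independent predictors.
import numpy as np


def nearest_centroid_labels(source_feats, source_labels, target_feats):
    """Similarity-based model: label each target sample by its closest source-class centroid."""
    classes = np.unique(source_labels)
    centroids = np.stack([source_feats[source_labels == c].mean(axis=0) for c in classes])
    dists = np.linalg.norm(target_feats[:, None, :] - centroids[None, :, :], axis=-1)
    return classes[dists.argmin(axis=1)]


def agreed_pseudo_labels(source_feats, source_labels, target_feats, classifier_probs):
    """classifier_probs: (num_target, num_classes) softmax output of a deep model."""
    sim_labels = nearest_centroid_labels(source_feats, source_labels, target_feats)
    clf_labels = classifier_probs.argmax(axis=1)
    keep = sim_labels == clf_labels          # trust only the consensus predictions
    return np.where(keep)[0], sim_labels[keep]


# toy usage with random features and a random "deep classifier"
rng = np.random.default_rng(0)
src_x, src_y = rng.normal(size=(100, 16)), rng.integers(0, 3, size=100)
tgt_x = rng.normal(size=(40, 16))
probs = rng.dirichlet(np.ones(3), size=40)
idx, labels = agreed_pseudo_labels(src_x, src_y, tgt_x, probs)
print(f"pseudo-labeled {len(idx)} of {len(tgt_x)} target samples")
```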

arXiv:2410.02677 (https://arxiv.org/abs/2410.02677) [pdf, other] cs.CL cs.AI cs.LG
CulturalBench: a Robust, Diverse and Challenging Benchmark on Measuring the (Lack of) Cultural Knowledge of LLMs
Authors: Yu Ying Chiu, Liwei Jiang, Bill Yuchen Lin, Chan Young Park, Shuyue Stella Li, Sahithya Ravi, Mehar Bhatia, Maria Antoniak, Yulia Tsvetkov, Vered Shwartz, Yejin Choi
Abstract: To make large language models (LLMs) more helpful across diverse cultures, it is essential to have effective cultural knowledge benchmarks to measure and track our progress. Effective benchmarks need to be robust, diverse, and challenging. We introduce CulturalBench: a set of 1,227 human-written and human-verified questions for effectively assessing LLMs' cultural knowledge, covering 45 global regions including underrepresented ones such as Bangladesh, Zimbabwe, and Peru. The questions, each verified by five independent annotators, span 17 diverse topics ranging from food preferences to greeting etiquette. We evaluate models on two setups, CulturalBench-Easy and CulturalBench-Hard, which share the same questions but ask them differently. We find that LLMs are sensitive to this difference in setup (e.g., a 27.3% difference for GPT-4o).
Compared to human performance (92.6% accuracy), CulturalBench-Hard is more challenging for frontier LLMs, with the best-performing model (GPT-4o) at only 61.5% and the worst (Llama3-8b) at 21.4%. Moreover, we find that LLMs often struggle with tricky questions that have multiple correct answers (e.g., What utensils do the Chinese usually use?), revealing a tendency to converge on a single answer. Our results also indicate that OpenAI's GPT-4o substantially outperforms other proprietary and open-source models on questions related to all but one region (Oceania). Nonetheless, all models consistently underperform on questions related to South America and the Middle East.
Submitted 3 October, 2024; originally announced October 2024.
Comments: Preprint. Under review

arXiv:2410.02566 (https://arxiv.org/abs/2410.02566) [pdf, other] cs.LG cs.CE math.NA
Deep Learning-Based Prediction of Suspension Dynamics Performance in Multi-Axle Vehicles
Authors: Kai Chun Lin, Bo-Yi Lin
Abstract: This paper presents a deep learning-based framework for predicting the dynamic performance of suspension systems in multi-axle vehicles, emphasizing the integration of machine learning with traditional vehicle dynamics modeling.
A Multi-Task Deep Belief Network Deep Neural Network (MTL-DBN-DNN) was developed to capture the relationships between key vehicle parameters and suspension performance metrics. The model was trained on data generated from numerical simulations and demonstrated superior prediction accuracy compared to conventional DNN models. A comprehensive sensitivity analysis was conducted to assess the impact of various vehicle and suspension parameters on dynamic suspension performance. Additionally, the Suspension Dynamic Performance Index (SDPI) was introduced as a holistic measure to quantify overall suspension performance, accounting for the combined effects of multiple parameters. The findings highlight the effectiveness of multitask learning in improving predictive models for complex vehicle systems.
Submitted 3 October, 2024; originally announced October 2024.

arXiv:2410.01733 (https://arxiv.org/abs/2410.01733) [pdf, other] cs.CL
Visual Perception in Text Strings
Authors: Qi Jia, Xiang Yue, Shanshan Huang, Ziheng Qin, Yizhu Liu, Bill Yuchen Lin, Yang You
Abstract: Understanding visual semantics embedded in consecutive characters is a crucial capability for both large language models (LLMs) and multi-modal large language models (MLLMs).
This type of artifact possesses the unique characteristic that identical information can be readily formulated in both text and images, making it a significant proxy for analyzing modern LLMs' and MLLMs' capabilities in modality-agnostic vision understanding. In this work, we select ASCII art as a representative artifact, where the lines and brightness used to depict each concept are rendered by characters, and we frame the problem as an ASCII art recognition task. We benchmark model performance on this task by constructing an evaluation dataset with an elaborate categorization tree and also collect a training set to elicit the models' visual perception ability. Through a comprehensive analysis of dozens of models, the results reveal that although humans can achieve nearly 100% accuracy, state-of-the-art LLMs and MLLMs lag far behind. Models are capable of recognizing concepts depicted in ASCII art given only text inputs, with over 60% accuracy for some concepts, but most of them achieve merely around 30% accuracy when averaged across all categories. When provided with images as inputs, GPT-4o reaches 82.68%, outperforming the strongest open-source MLLM by 21.95%. Although models favor different kinds of ASCII art depending on the modality provided, none of the MLLMs successfully benefit when both modalities are supplied simultaneously. Moreover, supervised fine-tuning helps improve models' accuracy, especially when the image modality is provided, but it also highlights the need for better training techniques to enhance information fusion among modalities.
Submitted 2 October, 2024; originally announced October 2024.

arXiv:2410.00897 (https://arxiv.org/abs/2410.00897) [pdf, other] cs.CY cs.AI cs.HC q-bio.OT
The Gradient of Health Data Privacy
Authors: Baihan Lin
Abstract: In the era of digital health and artificial intelligence, the management of patient data privacy has become increasingly complex, with significant implications for global health equity and patient trust.
This paper introduces a novel "privacy gradient" approach to health data governance, offering a more nuanced and adaptive framework than traditional binary privacy models. Our multidimensional concept considers factors such as data sensitivity, stakeholder relationships, purpose of use, and temporal aspects, allowing for context-sensitive privacy protections. Through policy analyses, ethical considerations, and case studies spanning adolescent health, integrated care, and genomic research, we demonstrate how this approach can address critical privacy challenges in diverse healthcare settings worldwide. The privacy gradient model has the potential to enhance patient engagement, improve care coordination, and accelerate medical research while safeguarding individual privacy rights. We provide policy recommendations for implementing this approach, considering its impact on healthcare systems, research infrastructures, and global health initiatives. This work aims to inform policymakers, healthcare leaders, and digital health innovators, contributing to a more equitable, trustworthy, and effective global health data ecosystem in the digital age.
Submitted 1 October, 2024; originally announced October 2024.

arXiv:2409.19886 (https://arxiv.org/abs/2409.19886) [pdf, other] cs.LG cs.AI cs.CL
RouterDC: Query-Based Router by Dual Contrastive Learning for Assembling Large Language Models
Kwok</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19886v1-abstract-short" style="display: inline;"> Recent works show that assembling multiple off-the-shelf large language models (LLMs) can harness their complementary abilities. To achieve this, routing is a promising method, which learns a router to select the most suitable LLM for each query. However, existing routing models are ineffective when multiple LLMs perform well for a query. To address this problem, in this paper, we propose a method… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19886v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19886v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19886v1-abstract-full" style="display: none;"> Recent works show that assembling multiple off-the-shelf large language models (LLMs) can harness their complementary abilities. To achieve this, routing is a promising method, which learns a router to select the most suitable LLM for each query. However, existing routing models are ineffective when multiple LLMs perform well for a query. To address this problem, in this paper, we propose a method called query-based Router by Dual Contrastive learning (RouterDC). The RouterDC model consists of an encoder and LLM embeddings, and we propose two contrastive learning losses to train the RouterDC model. Experimental results show that RouterDC is effective in assembling LLMs and largely outperforms individual top-performing LLMs as well as existing routing methods on both in-distribution (+2.76\%) and out-of-distribution (+1.90\%) tasks. Source code is available at https://github.com/shuhao02/RouterDC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19886v1-abstract-full').style.display = 'none'; document.getElementById('2409.19886v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
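As context for the RouterDC abstract above: the described router is an encoder plus a set of learnable LLM embeddings trained with two contrastive losses. The snippet below is a minimal PyTorch sketch of that general idea, not the authors' implementation; it keeps only one InfoNCE-style query-to-LLM loss, omits the paper's second contrastive loss and all training details, and assumes a frozen text encoder producing `query_vec` exists elsewhere.

```python
import torch
import torch.nn.functional as F

class RouterSketch(torch.nn.Module):
    """Illustrative query router: scores each candidate LLM for a query."""
    def __init__(self, num_llms: int, dim: int = 384):
        super().__init__()
        # One learnable embedding per candidate LLM.
        self.llm_embeddings = torch.nn.Embedding(num_llms, dim)
        # Small projection head on top of a frozen text encoder (assumed external).
        self.proj = torch.nn.Linear(dim, dim)

    def forward(self, query_vec: torch.Tensor) -> torch.Tensor:
        q = F.normalize(self.proj(query_vec), dim=-1)        # (B, d)
        e = F.normalize(self.llm_embeddings.weight, dim=-1)  # (K, d)
        return q @ e.T                                       # (B, K) similarity scores

def contrastive_router_loss(scores, good_mask, temperature=0.07):
    """Pull each query toward LLMs that answered it well (good_mask entries set to 1.0)
    and away from the rest; a simplified stand-in for the paper's dual losses."""
    log_probs = F.log_softmax(scores / temperature, dim=-1)
    pos = (log_probs * good_mask).sum(dim=-1) / good_mask.sum(dim=-1).clamp(min=1)
    return -pos.mean()

# At inference, the query is routed to the highest-scoring LLM:
# chosen_llm = scores.argmax(dim=-1)
```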
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18461">arXiv:2409.18461</a> <span> [<a href="https://arxiv.org/pdf/2409.18461">pdf</a>, <a href="https://arxiv.org/format/2409.18461">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Towards Diverse Device Heterogeneous Federated Learning via Task Arithmetic Knowledge Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Morafah%2C+M">Mahdi Morafah</a>, <a href="/search/cs?searchtype=author&query=Kungurtsev%2C+V">Vyacheslav Kungurtsev</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Hojin Chang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bill Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18461v2-abstract-short" style="display: inline;"> Federated Learning has emerged as a promising paradigm for collaborative machine learning, while preserving user data privacy. Despite its potential, standard FL lacks support for diverse heterogeneous device prototypes, which vary significantly in model and dataset sizes -- from small IoT devices to large workstations. This limitation is only partially addressed by existing knowledge distillation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18461v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18461v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18461v2-abstract-full" style="display: none;"> Federated Learning has emerged as a promising paradigm for collaborative machine learning, while preserving user data privacy. Despite its potential, standard FL lacks support for diverse heterogeneous device prototypes, which vary significantly in model and dataset sizes -- from small IoT devices to large workstations. This limitation is only partially addressed by existing knowledge distillation techniques, which often fail to transfer knowledge effectively across a broad spectrum of device prototypes with varied capabilities. This failure primarily stems from two issues: the dilution of informative logits from more capable devices by those from less capable ones, and the use of a single integrated logits as the distillation target across all devices, which neglects their individual learning capacities and and the unique contributions of each. 
To address these challenges, we introduce TAKFL, a novel KD-based framework that treats the knowledge transfer from each device prototype's ensemble as a separate task, independently distilling each to preserve its unique contributions and avoid dilution. TAKFL also incorporates a KD-based self-regularization technique to mitigate the issues related to the noisy and unsupervised ensemble distillation process. To integrate the separately distilled knowledge, we introduce an adaptive task arithmetic knowledge integration process, allowing each student model to customize the knowledge integration for optimal performance. Additionally, we present theoretical results demonstrating the effectiveness of task arithmetic in transferring knowledge across heterogeneous devices with varying capacities. Comprehensive evaluations of our method across both CV and NLP tasks demonstrate that TAKFL achieves SOTA results in a variety of datasets and settings, significantly outperforming existing KD-based methods Code is released at https://github.com/MMorafah/TAKFL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18461v2-abstract-full').style.display = 'none'; document.getElementById('2409.18461v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 38th Conference on Neural Information Processing Systems (NeurIPS 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16427">arXiv:2409.16427</a> <span> [<a href="https://arxiv.org/pdf/2409.16427">pdf</a>, <a href="https://arxiv.org/format/2409.16427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HAICOSYSTEM: An Ecosystem for Sandboxing Safety Risks in Human-AI Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xuhui Zhou</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hyunwoo Kim</a>, <a href="/search/cs?searchtype=author&query=Brahman%2C+F">Faeze Brahman</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Liwei Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hao Zhu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Ximing Lu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Frank Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B+Y">Bill Yuchen Lin</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&query=Mireshghallah%2C+N">Niloofar Mireshghallah</a>, <a href="/search/cs?searchtype=author&query=Bras%2C+R+L">Ronan Le Bras</a>, <a href="/search/cs?searchtype=author&query=Sap%2C+M">Maarten Sap</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16427v3-abstract-short" style="display: inline;"> AI agents are increasingly autonomous in their interactions with human users and tools, leading to increased interactional safety risks. We present HAICOSYSTEM, a framework examining AI agent safety within diverse and complex social interactions. HAICOSYSTEM features a modular sandbox environment that simulates multi-turn interactions between human users and AI agents, where the AI agents are equi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16427v3-abstract-full').style.display = 'inline'; document.getElementById('2409.16427v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16427v3-abstract-full" style="display: none;"> AI agents are increasingly autonomous in their interactions with human users and tools, leading to increased interactional safety risks. We present HAICOSYSTEM, a framework examining AI agent safety within diverse and complex social interactions. HAICOSYSTEM features a modular sandbox environment that simulates multi-turn interactions between human users and AI agents, where the AI agents are equipped with a variety of tools (e.g., patient management platforms) to navigate diverse scenarios (e.g., a user attempting to access other patients' profiles). To examine the safety of AI agents in these interactions, we develop a comprehensive multi-dimensional evaluation framework that uses metrics covering operational, content-related, societal, and legal risks. Through running 1840 simulations based on 92 scenarios across seven domains (e.g., healthcare, finance, education), we demonstrate that HAICOSYSTEM can emulate realistic user-AI interactions and complex tool use by AI agents. Our experiments show that state-of-the-art LLMs, both proprietary and open-sourced, exhibit safety risks in over 50\% cases, with models generally showing higher risks when interacting with simulated malicious users. Our findings highlight the ongoing challenge of building agents that can safely navigate complex interactions, particularly when faced with malicious users. To foster the AI agent safety ecosystem, we release a code platform that allows practitioners to create custom scenarios, simulate interactions, and evaluate the safety and performance of their agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16427v3-abstract-full').style.display = 'none'; document.getElementById('2409.16427v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Both the second and third authors contributed equally</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14082">arXiv:2409.14082</a> <span> [<a href="https://arxiv.org/pdf/2409.14082">pdf</a>, <a href="https://arxiv.org/format/2409.14082">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PTD-SQL: Partitioning and Targeted Drilling with LLMs in Text-to-SQL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Ruilin Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binghuai Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zicheng Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14082v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have emerged as powerful tools for Text-to-SQL tasks, exhibiting remarkable reasoning capabilities. Different from tasks such as math word problems and commonsense reasoning, SQL solutions have a relatively fixed pattern. This facilitates the investigation of whether LLMs can benefit from categorical thinking, mirroring how humans acquire knowledge through inductive re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14082v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14082v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14082v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have emerged as powerful tools for Text-to-SQL tasks, exhibiting remarkable reasoning capabilities. Different from tasks such as math word problems and commonsense reasoning, SQL solutions have a relatively fixed pattern. This facilitates the investigation of whether LLMs can benefit from categorical thinking, mirroring how humans acquire knowledge through inductive reasoning based on comparable examples. In this study, we propose that employing query group partitioning allows LLMs to focus on learning the thought processes specific to a single problem type, consequently enhancing their reasoning abilities across diverse difficulty levels and problem categories. Our experiments reveal that multiple advanced LLMs, when equipped with PTD-SQL, can either surpass or match previous state-of-the-art (SOTA) methods on the Spider and BIRD datasets. Intriguingly, models with varying initial performances have exhibited significant improvements, mainly at the boundary of their capabilities after targeted drilling, suggesting a parallel with human progress. Code is available at https://github.com/lrlbbzl/PTD-SQL. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14082v1-abstract-full').style.display = 'none'; document.getElementById('2409.14082v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Main Conference. Revised by ARR April and ARR June. 32 pages, 7 figures and 30 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12139">arXiv:2409.12139</a> <span> [<a href="https://arxiv.org/pdf/2409.12139">pdf</a>, <a href="https://arxiv.org/format/2409.12139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Takin: A Cohort of Superior Quality Zero-shot Speech Generation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sijing Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yuan Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Laipeng He</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tianwei He</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wendi He</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yanni Hu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bin Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yiting Lin</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yu Pan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+P">Pengfei Tan</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Chengwei Tian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruoye Xie</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jixun Yao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Q">Quanlei Yan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuguang Yang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jianhao Ye</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jingjing Yin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yanzhen Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huimin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Guangcheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongbin Zhou</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+P">Pengpeng Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12139v3-abstract-short" style="display: inline;"> With the advent 
of the big data and large language model era, zero-shot personalized rapid customization has emerged as a significant trend. In this report, we introduce Takin AudioLLM, a series of techniques and models, mainly including Takin TTS, Takin VC, and Takin Morphing, specifically designed for audiobook production. These models are capable of zero-shot speech production, generating high-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12139v3-abstract-full').style.display = 'inline'; document.getElementById('2409.12139v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12139v3-abstract-full" style="display: none;"> With the advent of the big data and large language model era, zero-shot personalized rapid customization has emerged as a significant trend. In this report, we introduce Takin AudioLLM, a series of techniques and models, mainly including Takin TTS, Takin VC, and Takin Morphing, specifically designed for audiobook production. These models are capable of zero-shot speech production, generating high-quality speech that is nearly indistinguishable from real human speech and facilitating individuals to customize the speech content according to their own needs. Specifically, we first introduce Takin TTS, a neural codec language model that builds upon an enhanced neural speech codec and a multi-task training framework, capable of generating high-fidelity natural speech in a zero-shot way. For Takin VC, we advocate an effective content and timbre joint modeling approach to improve the speaker similarity, while advocating for a conditional flow matching based decoder to further enhance its naturalness and expressiveness. Last, we propose the Takin Morphing system with highly decoupled and advanced timbre and prosody modeling approaches, which enables individuals to customize speech production with their preferred timbre and prosody in a precise and controllable manner. Extensive experiments validate the effectiveness and robustness of our Takin AudioLLM series models. For detailed demos, please refer to https://everest-ai.github.io/takinaudiollm/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12139v3-abstract-full').style.display = 'none'; document.getElementById('2409.12139v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report; 18 pages; typos corrected, references added, demo url modified, author name modified;</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09755">arXiv:2409.09755</a> <span> [<a href="https://arxiv.org/pdf/2409.09755">pdf</a>, <a href="https://arxiv.org/format/2409.09755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Analysis of Centrifugal Clutches in Two-Speed Automatic Transmissions with Deep Learning-Based Engagement Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bo-Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K+C">Kai Chun Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09755v2-abstract-short" style="display: inline;"> This paper presents a comprehensive numerical analysis of centrifugal clutch systems integrated with a two-speed automatic transmission, a key component in automotive torque transfer. Centrifugal clutches enable torque transmission based on rotational speed without external controls. The study systematically examines various clutch configurations effects on transmission dynamics, focusing on torqu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09755v2-abstract-full').style.display = 'inline'; document.getElementById('2409.09755v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09755v2-abstract-full" style="display: none;"> This paper presents a comprehensive numerical analysis of centrifugal clutch systems integrated with a two-speed automatic transmission, a key component in automotive torque transfer. Centrifugal clutches enable torque transmission based on rotational speed without external controls. The study systematically examines various clutch configurations effects on transmission dynamics, focusing on torque transfer, upshifting, and downshifting behaviors under different conditions. A Deep Neural Network (DNN) model predicts clutch engagement using parameters such as spring preload and shoe mass, offering an efficient alternative to complex simulations. The integration of deep learning and numerical modeling provides critical insights for optimizing clutch designs, enhancing transmission performance and efficiency. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09755v2-abstract-full').style.display = 'none'; document.getElementById('2409.09755v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07641">arXiv:2409.07641</a> <span> [<a href="https://arxiv.org/pdf/2409.07641">pdf</a>, <a href="https://arxiv.org/ps/2409.07641">ps</a>, <a href="https://arxiv.org/format/2409.07641">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SimulBench: Evaluating Language Models with Creative Simulation Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Q">Qi Jia</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+X">Xiang Yue</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianyu Zheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jie Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B+Y">Bill Yuchen Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07641v1-abstract-short" style="display: inline;"> We introduce SimulBench, a benchmark designed to evaluate large language models (LLMs) across a diverse collection of creative simulation scenarios, such as acting as a Linux terminal or playing text games with users. While these simulation tasks serve as effective measures of an LLM's general intelligence, they are seldom incorporated into existing benchmarks. A major challenge is to develop an e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07641v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07641v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07641v1-abstract-full" style="display: none;"> We introduce SimulBench, a benchmark designed to evaluate large language models (LLMs) across a diverse collection of creative simulation scenarios, such as acting as a Linux terminal or playing text games with users. While these simulation tasks serve as effective measures of an LLM's general intelligence, they are seldom incorporated into existing benchmarks. A major challenge is to develop an evaluation framework for testing different LLMs fairly while preserving the multi-round interactive nature of simulation tasks between users and AI. To tackle this issue, we suggest using a fixed LLM as a user agent to engage with an LLM to collect dialogues first under different tasks. Then, challenging dialogue scripts are extracted for evaluating different target LLMs. To facilitate automatic assessment on \DataName{}, GPT-4 is employed as the evaluator, tasked with reviewing the quality of the final response generated by the target LLMs given multi-turn dialogue scripts. 
Our comprehensive experiments indicate that these simulation tasks continue to pose a significant challenge with their unique natures and show the gap between proprietary models and the most advanced open LLMs. For example, GPT-4-turbo outperforms LLaMA-3-70b-Chat on 18.55\% more cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07641v1-abstract-full').style.display = 'none'; document.getElementById('2409.07641v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website: https://simulbench.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04456">arXiv:2409.04456</a> <span> [<a href="https://arxiv.org/pdf/2409.04456">pdf</a>, <a href="https://arxiv.org/format/2409.04456">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Pattern based learning and optimisation through pricing for bin packing problem </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huayan Zhang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+R">Ruibin Bai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tie-Yan Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiawei Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bingchen Lin</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jianfeng Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04456v1-abstract-short" style="display: inline;"> As a popular form of knowledge and experience, patterns and their identification have been critical tasks in most data mining applications. However, as far as we are aware, no study has systematically examined the dynamics of pattern values and their reuse under varying conditions. We argue that when problem conditions such as the distributions of random variables change, the patterns that perform… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04456v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04456v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04456v1-abstract-full" style="display: none;"> As a popular form of knowledge and experience, patterns and their identification have been critical tasks in most data mining applications. 
However, as far as we are aware, no study has systematically examined the dynamics of pattern values and their reuse under varying conditions. We argue that when problem conditions such as the distributions of random variables change, the patterns that performed well in previous circumstances may become less effective and adoption of these patterns would result in sub-optimal solutions. In response, we make a connection between data mining and the duality theory in operations research and propose a novel scheme to efficiently identify patterns and dynamically quantify their values for each specific condition. Our method quantifies the value of patterns based on their ability to satisfy stochastic constraints and their effects on the objective value, allowing high-quality patterns and their combinations to be detected. We use the online bin packing problem to evaluate the effectiveness of the proposed scheme and illustrate the online packing procedure with the guidance of patterns that address the inherent uncertainty of the problem. Results show that the proposed algorithm significantly outperforms the state-of-the-art methods. We also analysed in detail the distinctive features of the proposed methods that lead to performance improvement and the special cases where our method can be further improved. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04456v1-abstract-full').style.display = 'none'; document.getElementById('2409.04456v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
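The pricing idea above is reminiscent of column generation for bin-packing/cutting-stock formulations, where a packing pattern is valued through the dual prices of the item-demand constraints. The sketch below shows only that textbook-style valuation, not the paper's scheme; the dual prices are assumed to be produced elsewhere (e.g., by solving an LP relaxation), and the example numbers are made up.

```python
def pattern_value(pattern, dual_prices):
    """Value of a packing pattern = total dual price of the items it covers.

    pattern: dict item_type -> count packed into one bin
    dual_prices: dict item_type -> dual price of that item's demand constraint
    """
    return sum(count * dual_prices[item] for item, count in pattern.items())

def profitable_patterns(patterns, dual_prices, bin_cost=1.0):
    """Keep patterns whose value exceeds the cost of opening a bin (negative reduced cost),
    sorted from most to least profitable."""
    scored = [(pattern_value(p, dual_prices) - bin_cost, p) for p in patterns]
    return sorted([sp for sp in scored if sp[0] > 0], key=lambda sp: -sp[0])

# Toy duals: only the pattern combining one large and one small item beats the bin cost.
duals = {"large": 0.7, "small": 0.35}
print(profitable_patterns([{"large": 1, "small": 1}, {"small": 2}], duals))
```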
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01658">arXiv:2409.01658</a> <span> [<a href="https://arxiv.org/pdf/2409.01658">pdf</a>, <a href="https://arxiv.org/format/2409.01658">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Yes-Men to Truth-Tellers: Addressing Sycophancy in Large Language Models with Pinpoint Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhen Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Liang Xie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Houqiang Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Le Lu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+X">Xinmei Tian</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+D">Deng Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yonggang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xu Shen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01658v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) tend to prioritize adherence to user prompts over providing veracious responses, leading to the sycophancy issue. When challenged by users, LLMs tend to admit mistakes and provide inaccurate responses even if they initially provided the correct answer. Recent works propose to employ supervised fine-tuning (SFT) to mitigate the sycophancy issue, while it typically leads… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01658v2-abstract-full').style.display = 'inline'; document.getElementById('2409.01658v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01658v2-abstract-full" style="display: none;"> Large Language Models (LLMs) tend to prioritize adherence to user prompts over providing veracious responses, leading to the sycophancy issue. When challenged by users, LLMs tend to admit mistakes and provide inaccurate responses even if they initially provided the correct answer. Recent works propose to employ supervised fine-tuning (SFT) to mitigate the sycophancy issue, while it typically leads to the degeneration of LLMs' general capability. To address the challenge, we propose a novel supervised pinpoint tuning (SPT), where the region-of-interest modules are tuned for a given objective. Specifically, SPT first reveals and verifies a small percentage (<5%) of the basic modules, which significantly affect a particular behavior of LLMs. i.e., sycophancy. Subsequently, SPT merely fine-tunes these identified modules while freezing the rest. To verify the effectiveness of the proposed SPT, we conduct comprehensive experiments, demonstrating that SPT significantly mitigates the sycophancy issue of LLMs (even better than SFT). 
Moreover, SPT introduces limited or even no side effects on the general capability of LLMs. Our results shed light on how to precisely, effectively, and efficiently explain and improve the targeted ability of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01658v2-abstract-full').style.display = 'none'; document.getElementById('2409.01658v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01199">arXiv:2409.01199</a> <span> [<a href="https://arxiv.org/pdf/2409.01199">pdf</a>, <a href="https://arxiv.org/format/2409.01199">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> OD-VAE: An Omni-dimensional Video Compressor for Improving Latent Video Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liuhan Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zongjian Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bin Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+B">Bin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shenghai Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xing Zhou</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinhua Cheng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Li Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01199v2-abstract-short" style="display: inline;"> Variational Autoencoder (VAE), compressing videos into latent representations, is a crucial preceding component of Latent Video Diffusion Models (LVDMs). With the same reconstruction quality, the more sufficient the VAE's compression for videos is, the more efficient the LVDMs are. However, most LVDMs utilize 2D image VAE, whose compression for videos is only in the spatial dimension and often ign… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01199v2-abstract-full').style.display = 'inline'; document.getElementById('2409.01199v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01199v2-abstract-full" style="display: none;"> Variational Autoencoder (VAE), compressing videos into latent representations, is a crucial preceding component of Latent Video Diffusion Models (LVDMs). 
With the same reconstruction quality, the more sufficient the VAE's compression for videos is, the more efficient the LVDMs are. However, most LVDMs utilize 2D image VAE, whose compression for videos is only in the spatial dimension and often ignored in the temporal dimension. How to conduct temporal compression for videos in a VAE to obtain more concise latent representations while promising accurate reconstruction is seldom explored. To fill this gap, we propose an omni-dimension compression VAE, named OD-VAE, which can temporally and spatially compress videos. Although OD-VAE's more sufficient compression brings a great challenge to video reconstruction, it can still achieve high reconstructed accuracy by our fine design. To obtain a better trade-off between video reconstruction quality and compression speed, four variants of OD-VAE are introduced and analyzed. In addition, a novel tail initialization is designed to train OD-VAE more efficiently, and a novel inference strategy is proposed to enable OD-VAE to handle videos of arbitrary length with limited GPU memory. Comprehensive experiments on video reconstruction and LVDM-based video generation demonstrate the effectiveness and efficiency of our proposed methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01199v2-abstract-full').style.display = 'none'; document.getElementById('2409.01199v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/PKU-YuanGroup/Open-Sora-Plan</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15101">arXiv:2408.15101</a> <span> [<a href="https://arxiv.org/pdf/2408.15101">pdf</a>, <a href="https://arxiv.org/format/2408.15101">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MTMamba++: Enhancing Multi-Task Dense Scene Understanding via Mamba-Based Decoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+B">Baijiong Lin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Weisen Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pengguang Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shu Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15101v1-abstract-short" style="display: inline;"> Multi-task dense scene understanding, which trains a model for multiple dense prediction tasks, has a wide range of application scenarios. 
Capturing long-range dependency and enhancing cross-task interactions are crucial to multi-task dense prediction. In this paper, we propose MTMamba++, a novel architecture for multi-task scene understanding featuring with a Mamba-based decoder. It contains two… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15101v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15101v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15101v1-abstract-full" style="display: none;"> Multi-task dense scene understanding, which trains a model for multiple dense prediction tasks, has a wide range of application scenarios. Capturing long-range dependency and enhancing cross-task interactions are crucial to multi-task dense prediction. In this paper, we propose MTMamba++, a novel architecture for multi-task scene understanding featuring with a Mamba-based decoder. It contains two types of core blocks: self-task Mamba (STM) block and cross-task Mamba (CTM) block. STM handles long-range dependency by leveraging state-space models, while CTM explicitly models task interactions to facilitate information exchange across tasks. We design two types of CTM block, namely F-CTM and S-CTM, to enhance cross-task interaction from feature and semantic perspectives, respectively. Experiments on NYUDv2, PASCAL-Context, and Cityscapes datasets demonstrate the superior performance of MTMamba++ over CNN-based and Transformer-based methods. The code is available at https://github.com/EnVision-Research/MTMamba. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15101v1-abstract-full').style.display = 'none'; document.getElementById('2408.15101v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
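To make the STM/CTM decomposition above concrete, here is a structural skeleton only: the state-space (Mamba) scan inside the self-task block is replaced by a placeholder depthwise convolution, and the cross-task block performs a simplified feature-level mixing loosely in the spirit of F-CTM. None of this reflects the actual MTMamba++ implementation.

```python
import torch

class STMBlockStandIn(torch.nn.Module):
    """Self-task block stand-in: the paper uses a state-space (Mamba) scan here; this sketch
    substitutes a depthwise 1D convolution purely to keep the skeleton runnable."""
    def __init__(self, dim):
        super().__init__()
        self.mix = torch.nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = torch.nn.LayerNorm(dim)

    def forward(self, x):              # x: (B, L, C) flattened spatial tokens for one task
        y = self.mix(x.transpose(1, 2)).transpose(1, 2)
        return self.norm(x + y)

class CTMBlockStandIn(torch.nn.Module):
    """Cross-task block stand-in: each task's tokens are updated with a gated average of all
    tasks' tokens, a heavily simplified feature-level exchange."""
    def __init__(self, dim):
        super().__init__()
        self.gate = torch.nn.Linear(2 * dim, dim)

    def forward(self, feats):          # feats: list of (B, L, C), one tensor per task
        shared = torch.stack(feats).mean(dim=0)
        out = []
        for f in feats:
            g = torch.sigmoid(self.gate(torch.cat([f, shared], dim=-1)))
            out.append(f + g * shared)
        return out
```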
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2407.02228</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11948">arXiv:2408.11948</a> <span> [<a href="https://arxiv.org/pdf/2408.11948">pdf</a>, <a href="https://arxiv.org/format/2408.11948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Geometric Topology">math.GT</span> </div> </div> <p class="title is-5 mathjax"> Topological Representational Similarity Analysis in Brains and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+B">Baihan Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11948v1-abstract-short" style="display: inline;"> Understanding how the brain represents and processes information is crucial for advancing neuroscience and artificial intelligence. Representational similarity analysis (RSA) has been instrumental in characterizing neural representations, but traditional RSA relies solely on geometric properties, overlooking crucial topological information. This thesis introduces Topological RSA (tRSA), a novel fr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11948v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11948v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11948v1-abstract-full" style="display: none;"> Understanding how the brain represents and processes information is crucial for advancing neuroscience and artificial intelligence. Representational similarity analysis (RSA) has been instrumental in characterizing neural representations, but traditional RSA relies solely on geometric properties, overlooking crucial topological information. This thesis introduces Topological RSA (tRSA), a novel framework combining geometric and topological properties of neural representations. tRSA applies nonlinear monotonic transforms to representational dissimilarities, emphasizing local topology while retaining intermediate-scale geometry. The resulting geo-topological matrices enable model comparisons robust to noise and individual idiosyncrasies. This thesis introduces several key methodological advances: (1) Topological RSA (tRSA) for identifying computational signatures and testing topological hypotheses; (2) Adaptive Geo-Topological Dependence Measure (AGTDM) for detecting complex multivariate relationships; (3) Procrustes-aligned Multidimensional Scaling (pMDS) for revealing neural computation stages; (4) Temporal Topological Data Analysis (tTDA) for uncovering developmental trajectories; and (5) Single-cell Topological Simplicial Analysis (scTSA) for characterizing cell population complexity. 
Through analyses of neural recordings, biological data, and neural network simulations, this thesis demonstrates the power and versatility of these methods in understanding brains, computational models, and complex biological systems. They not only offer robust approaches for adjudicating among competing models but also reveal novel theoretical insights into the nature of neural computation. This work lays the foundation for future investigations at the intersection of topology, neuroscience, and time series analysis, paving the way for more nuanced understanding of brain function and dysfunction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11948v1-abstract-full').style.display = 'none'; document.getElementById('2408.11948v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Thesis defended by Baihan Lin (bl2681@columbia.edu) in 2023 for PhD in Computational Neuroscience at Columbia University; unifies and extends work from PNAS, WWW, CCN, ISMB, BIBM etc. (arXiv:2309.11028, 2203.05488, 1906.09264, 2204.14048, 1810.02923)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09144">arXiv:2408.09144</a> <span> [<a href="https://arxiv.org/pdf/2408.09144">pdf</a>, <a href="https://arxiv.org/format/2408.09144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SSNeRF: Sparse View Semi-supervised Neural Radiance Fields with Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiao Cao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Beibei Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhiyong Huang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+R+T">Robby T. Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09144v1-abstract-short" style="display: inline;"> Sparse view NeRF is challenging because limited input images lead to an under constrained optimization problem for volume rendering. Existing methods address this issue by relying on supplementary information, such as depth maps. However, generating this supplementary information accurately remains problematic and often leads to NeRF producing images with undesired artifacts. 
To address these arti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09144v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09144v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09144v1-abstract-full" style="display: none;"> Sparse view NeRF is challenging because limited input images lead to an under constrained optimization problem for volume rendering. Existing methods address this issue by relying on supplementary information, such as depth maps. However, generating this supplementary information accurately remains problematic and often leads to NeRF producing images with undesired artifacts. To address these artifacts and enhance robustness, we propose SSNeRF, a sparse view semi supervised NeRF method based on a teacher student framework. Our key idea is to challenge the NeRF module with progressively severe sparse view degradation while providing high confidence pseudo labels. This approach helps the NeRF model become aware of noise and incomplete information associated with sparse views, thus improving its robustness. The novelty of SSNeRF lies in its sparse view specific augmentations and semi supervised learning mechanism. In this approach, the teacher NeRF generates novel views along with confidence scores, while the student NeRF, perturbed by the augmented input, learns from the high confidence pseudo labels. Our sparse view degradation augmentation progressively injects noise into volume rendering weights, perturbs feature maps in vulnerable layers, and simulates sparse view blurriness. These augmentation strategies force the student NeRF to recognize degradation and produce clearer rendered views. By transferring the student's parameters to the teacher, the teacher gains increased robustness in subsequent training iterations. Extensive experiments demonstrate the effectiveness of our SSNeRF in generating novel views with less sparse view degradation. We will release code upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09144v1-abstract-full').style.display = 'none'; document.getElementById('2408.09144v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07758">arXiv:2408.07758</a> <span> [<a href="https://arxiv.org/pdf/2408.07758">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RAVE Checklist: Recommendations for Overcoming Challenges in Retrospective Safety Studies of Automated Driving Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Scanlon%2C+J+M">John M. Scanlon</a>, <a href="/search/cs?searchtype=author&query=Teoh%2C+E+R">Eric R. Teoh</a>, <a href="/search/cs?searchtype=author&query=Kidd%2C+D+G">David G. Kidd</a>, <a href="/search/cs?searchtype=author&query=Kusano%2C+K+D">Kristofer D. 
Kusano</a>, <a href="/search/cs?searchtype=author&query=B%C3%A4rgman%2C+J">Jonas B盲rgman</a>, <a href="/search/cs?searchtype=author&query=Chi-Johnston%2C+G">Geoffrey Chi-Johnston</a>, <a href="/search/cs?searchtype=author&query=Di+Lillo%2C+L">Luigi Di Lillo</a>, <a href="/search/cs?searchtype=author&query=Favaro%2C+F">Francesca Favaro</a>, <a href="/search/cs?searchtype=author&query=Flannagan%2C+C">Carol Flannagan</a>, <a href="/search/cs?searchtype=author&query=Liers%2C+H">Henrik Liers</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bonnie Lin</a>, <a href="/search/cs?searchtype=author&query=Lindman%2C+M">Magdalena Lindman</a>, <a href="/search/cs?searchtype=author&query=McLaughlin%2C+S">Shane McLaughlin</a>, <a href="/search/cs?searchtype=author&query=Perez%2C+M">Miguel Perez</a>, <a href="/search/cs?searchtype=author&query=Victor%2C+T">Trent Victor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07758v1-abstract-short" style="display: inline;"> The public, regulators, and domain experts alike seek to understand the effect of deployed SAE level 4 automated driving system (ADS) technologies on safety. The recent expansion of ADS technology deployments is paving the way for early stage safety impact evaluations, whereby the observational data from both an ADS and a representative benchmark fleet are compared to quantify safety performance.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07758v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07758v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07758v1-abstract-full" style="display: none;"> The public, regulators, and domain experts alike seek to understand the effect of deployed SAE level 4 automated driving system (ADS) technologies on safety. The recent expansion of ADS technology deployments is paving the way for early stage safety impact evaluations, whereby the observational data from both an ADS and a representative benchmark fleet are compared to quantify safety performance. In January 2024, a working group of experts across academia, insurance, and industry came together in Washington, DC to discuss the current and future challenges in performing such evaluations. A subset of this working group then met, virtually, on multiple occasions to produce this paper. This paper presents the RAVE (Retrospective Automated Vehicle Evaluation) checklist, a set of fifteen recommendations for performing and evaluating retrospective ADS performance comparisons. The recommendations are centered around the concepts of (1) quality and validity, (2) transparency, and (3) interpretation. Over time, it is anticipated there will be a large and varied body of work evaluating the observed performance of these ADS fleets. Establishing and promoting good scientific practices benefits the work of stakeholders, many of whom may not be subject matter experts. This working group's intentions are to: i) strengthen individual research studies and ii) make the at-large community more informed on how to evaluate this collective body of work. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07758v1-abstract-full').style.display = 'none'; document.getElementById('2408.07758v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19548">arXiv:2407.19548</a> <span> [<a href="https://arxiv.org/pdf/2407.19548">pdf</a>, <a href="https://arxiv.org/format/2407.19548">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cycle3D: High-quality and Consistent Image-to-3D Generation via Generation-Reconstruction Cycle </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenyu Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junwu Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinhua Cheng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wangbo Yu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chaoran Feng</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yatian Pang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bin Lin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Li Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19548v1-abstract-short" style="display: inline;"> Recent 3D large reconstruction models typically employ a two-stage process, including first generate multi-view images by a multi-view diffusion model, and then utilize a feed-forward model to reconstruct images to 3D content.However, multi-view diffusion models often produce low-quality and inconsistent images, adversely affecting the quality of the final 3D reconstruction. To address this issue,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19548v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19548v1-abstract-full" style="display: none;"> Recent 3D large reconstruction models typically employ a two-stage process, including first generate multi-view images by a multi-view diffusion model, and then utilize a feed-forward model to reconstruct images to 3D content.However, multi-view diffusion models often produce low-quality and inconsistent images, adversely affecting the quality of the final 3D reconstruction. To address this issue, we propose a unified 3D generation framework called Cycle3D, which cyclically utilizes a 2D diffusion-based generation module and a feed-forward 3D reconstruction module during the multi-step diffusion process. 
Abstract: Recent 3D large reconstruction models typically employ a two-stage process: first generating multi-view images with a multi-view diffusion model, and then using a feed-forward model to reconstruct the images into 3D content. However, multi-view diffusion models often produce low-quality and inconsistent images, adversely affecting the quality of the final 3D reconstruction. To address this issue, we propose a unified 3D generation framework called Cycle3D, which cyclically applies a 2D diffusion-based generation module and a feed-forward 3D reconstruction module during the multi-step diffusion process. Concretely, the 2D diffusion model generates high-quality textures, while the reconstruction model guarantees multi-view consistency. Moreover, the 2D diffusion model can further control the generated content and inject reference-view information for unseen views, thereby enhancing the diversity and texture consistency of 3D generation during the denoising process. Extensive experiments demonstrate the superior ability of our method to create 3D content with high quality and consistency compared with state-of-the-art baselines.
Submitted 28 July, 2024; originally announced July 2024.
Comments: Project page: https://pku-yuangroup.github.io/Cycle3D/
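A rough sketch of the generation-reconstruction cycle as described in the abstract: at each denoising step the 2D diffusion module refines the multi-view images, a feed-forward reconstructor lifts them to 3D, and re-rendered views are blended back in for consistency. `denoise_step`, `reconstruct_3d`, `render_views`, and the blending weight are hypothetical placeholders, not the released implementation.

```python
# Cycle3D-style sampling loop (structural sketch only).
def cycle3d_sampling(noisy_views, timesteps, denoise_step, reconstruct_3d, render_views,
                     blend=0.5):
    views = noisy_views
    for t in timesteps:                      # e.g. [999, 998, ..., 0]
        views = denoise_step(views, t)       # 2D diffusion: high-quality texture
        rep3d = reconstruct_3d(views)        # feed-forward model: multi-view consistency
        rendered = render_views(rep3d)       # re-render the same camera poses
        # blend generated and re-rendered views before the next denoising step
        views = [blend * g + (1.0 - blend) * r for g, r in zip(views, rendered)]
    return reconstruct_3d(views)             # final 3D content
```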

arXiv:2407.19056 (https://arxiv.org/abs/2407.19056) [pdf, other]  cs.CL (Computation and Language)
OfficeBench: Benchmarking Language Agents across Multiple Applications for Office Automation
Authors: Zilong Wang, Yuedong Cui, Li Zhong, Zimin Zhang, Da Yin, Bill Yuchen Lin, Jingbo Shang
Abstract: Office automation significantly enhances human productivity by automatically finishing routine tasks in the workflow. Beyond the basic information extraction studied in much of the prior document AI literature, office automation research should be extended to more realistic office tasks that require integrating various information sources in the office system and producing outputs through a series of decision-making processes. We introduce OfficeBench, one of the first office automation benchmarks for evaluating current LLM agents' capability to address office tasks in realistic office workflows. OfficeBench requires LLM agents to perform feasible long-horizon planning, proficiently switch between applications in a timely manner, and accurately ground their actions within a large combined action space, based on the contextual demands of the workflow. Applying our customized evaluation methods to each task, we find that GPT-4 Omni achieves the highest pass rate of 47.00%, demonstrating decent performance in handling office tasks. However, this is still far below the human performance and accuracy standards required by real-world office workflows. We further observe that most issues are related to operation redundancy and hallucinations, as well as limitations in switching between multiple applications, which may provide valuable insights for developing effective agent frameworks for office automation.
Submitted 26 July, 2024; originally announced July 2024.
Comments: Preprint
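For concreteness, a toy harness showing how a pass rate such as the reported 47.00% could be computed over a task suite; `run_agent` and `check_task` are hypothetical stand-ins for an agent rollout and a per-task checker, not OfficeBench APIs.

```python
# Toy pass-rate evaluation over a suite of office tasks (illustrative only).
def pass_rate(tasks, run_agent, check_task):
    passed = 0
    for task in tasks:
        trajectory = run_agent(task)          # agent plans and switches between applications
        if check_task(task, trajectory):      # task-specific success check
            passed += 1
    return passed / len(tasks)

# e.g. pass_rate(tasks, run_agent, check_task) == 0.47 would correspond to 47.00%
```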

arXiv:2407.14872 (https://arxiv.org/abs/2407.14872) [pdf, other]  cs.CV (Computer Vision and Pattern Recognition); cs.RO (Robotics)
Adapt2Reward: Adapting Video-Language Models to Generalizable Robotic Rewards via Failure Prompts
Authors: Yanting Yang, Minghao Chen, Qibo Qiu, Jiahao Wu, Wenxiao Wang, Binbin Lin, Ziyu Guan, Xiaofei He
Abstract: For a general-purpose robot to operate in reality, executing a broad range of instructions across various environments is imperative. Central to the reinforcement learning and planning for such robotic agents is a generalizable reward function. Recent advances in vision-language models, such as CLIP, have shown remarkable performance in the domain of deep learning, paving the way for open-domain visual recognition. However, collecting data on robots executing various language instructions across multiple environments remains a challenge. This paper aims to transfer video-language models with robust generalization into a generalizable language-conditioned reward function, using only robot video data from a minimal number of tasks in a single environment. Unlike common robotic datasets used for training reward functions, human video-language datasets rarely contain trivial failure videos. To enhance the model's ability to distinguish between successful and failed robot executions, we cluster failure video features to enable the model to identify patterns within them. For each cluster, we integrate a newly trained failure prompt into the text encoder to represent the corresponding failure mode. Our language-conditioned reward function shows outstanding generalization to new environments and new instructions for robot planning and reinforcement learning.
Submitted 20 July, 2024; originally announced July 2024.
Comments: ECCV 2024 camera-ready
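A minimal sketch of the general recipe the abstract suggests, under my own assumptions: the reward is the probability that a video embedding matches the instruction embedding rather than any of the learned failure-prompt embeddings (one per failure cluster). All embeddings here are placeholder vectors, not the paper's model outputs.

```python
# Language-conditioned reward from video/text embeddings (illustrative sketch).
import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

def language_conditioned_reward(video_emb, instruction_emb, failure_prompt_embs, temp=0.07):
    """Softmax over [instruction, failure prompts]; return P(matching the instruction)."""
    sims = [cosine(video_emb, instruction_emb)] + \
           [cosine(video_emb, f) for f in failure_prompt_embs]
    logits = np.array(sims) / temp
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    return float(probs[0])

rng = np.random.default_rng(0)
video, instr = rng.normal(size=512), rng.normal(size=512)
failure_prompts = [rng.normal(size=512) for _ in range(3)]   # one embedding per failure cluster
print(language_conditioned_reward(video, instr, failure_prompts))
```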
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024 camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10457">arXiv:2407.10457</a> <span> [<a href="https://arxiv.org/pdf/2407.10457">pdf</a>, <a href="https://arxiv.org/format/2407.10457">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Good, The Bad, and The Greedy: Evaluation of LLMs Should Not Ignore Non-Determinism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yifan Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoyin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sujian Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B+Y">Bill Yuchen Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10457v1-abstract-short" style="display: inline;"> Current evaluations of large language models (LLMs) often overlook non-determinism, typically focusing on a single output per example. This limits our understanding of LLM performance variability in real-world applications. Our study addresses this issue by exploring key questions about the performance differences between greedy decoding and sampling, identifying benchmarks' consistency regarding… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10457v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10457v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10457v1-abstract-full" style="display: none;"> Current evaluations of large language models (LLMs) often overlook non-determinism, typically focusing on a single output per example. This limits our understanding of LLM performance variability in real-world applications. Our study addresses this issue by exploring key questions about the performance differences between greedy decoding and sampling, identifying benchmarks' consistency regarding non-determinism, and examining unique model behaviors. Through extensive experiments, we observe that greedy decoding generally outperforms sampling methods for most evaluated tasks. We also observe consistent performance across different LLM sizes and alignment methods, noting that alignment can reduce sampling variance. Moreover, our best-of-N sampling approach demonstrates that smaller LLMs can match or surpass larger models such as GPT-4-Turbo, highlighting the untapped potential of smaller LLMs. This research shows the importance of considering non-determinism in LLM evaluations and provides insights for future LLM development and evaluation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10457v1-abstract-full').style.display = 'none'; document.getElementById('2407.10457v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09992">arXiv:2407.09992</a> <span> [<a href="https://arxiv.org/pdf/2407.09992">pdf</a>, <a href="https://arxiv.org/format/2407.09992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> TOP:A New Target-Audience Oriented Content Paraphrase Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+B">Boda Lin</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiaxin Shi</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Haolong Yan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+B">Binghao Tang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+X">Xiaocheng Gong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Si Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09992v1-abstract-short" style="display: inline;"> Recommendation systems usually recommend the existing contents to different users. However, in comparison to static recommendation methods, a recommendation logic that dynamically adjusts based on user interest preferences may potentially attract a larger user base. Thus, we consider paraphrasing existing content based on the interests of the users to modify the content to better align with the pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09992v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09992v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09992v1-abstract-full" style="display: none;"> Recommendation systems usually recommend the existing contents to different users. However, in comparison to static recommendation methods, a recommendation logic that dynamically adjusts based on user interest preferences may potentially attract a larger user base. Thus, we consider paraphrasing existing content based on the interests of the users to modify the content to better align with the preferences of users. In this paper, we propose a new task named Target-Audience Oriented Content Paraphrase aims to generate more customized contents for the target audience. We introduce the task definition and the corresponding framework for the proposed task and the creation of the corresponding datasets. We utilize the Large Language Models (LLMs) and Large Vision Models (LVMs) to accomplish the base implementation of the TOP framework and provide the referential baseline results for the proposed task. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09992v1-abstract-full').style.display = 'none'; document.getElementById('2407.09992v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09787">arXiv:2407.09787</a> <span> [<a href="https://arxiv.org/pdf/2407.09787">pdf</a>, <a href="https://arxiv.org/format/2407.09787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semi-supervised 3D Object Detection with PatchTeacher and PillarMix </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaopei Wu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+L">Liang Peng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Liang Xie</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Y">Yuenan Hou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaoshui Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haifeng Liu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+D">Deng Cai</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09787v1-abstract-short" style="display: inline;"> Semi-supervised learning aims to leverage numerous unlabeled data to improve the model performance. Current semi-supervised 3D object detection methods typically use a teacher to generate pseudo labels for a student, and the quality of the pseudo labels is essential for the final performance. In this paper, we propose PatchTeacher, which focuses on partial scene 3D object detection to provide high… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09787v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09787v1-abstract-full" style="display: none;"> Semi-supervised learning aims to leverage numerous unlabeled data to improve the model performance. Current semi-supervised 3D object detection methods typically use a teacher to generate pseudo labels for a student, and the quality of the pseudo labels is essential for the final performance. In this paper, we propose PatchTeacher, which focuses on partial scene 3D object detection to provide high-quality pseudo labels for the student. Specifically, we divide a complete scene into a series of patches and feed them to our PatchTeacher sequentially. 
PatchTeacher leverages the low memory consumption of partial scene detection to process point clouds with high-resolution voxelization, which minimizes quantization information loss and extracts more fine-grained features. However, it is non-trivial to train a detector on fractions of the scene. Therefore, we introduce three key techniques, i.e., Patch Normalizer, Quadrant Align, and Fovea Selection, to improve the performance of PatchTeacher. Moreover, we devise PillarMix, a strong data augmentation strategy that mixes truncated pillars from different LiDAR scans to generate diverse training samples, thus helping the model learn a more general representation. Extensive experiments conducted on the Waymo and ONCE datasets verify the effectiveness and superiority of our method, and we achieve new state-of-the-art results, surpassing existing methods by a large margin. Codes are available at https://github.com/LittlePey/PTPM.
Submitted 13 July, 2024; originally announced July 2024.
Comments: Accepted by AAAI 2024
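A rough sketch of the PillarMix idea as I read it from the abstract (not the released PTPM code): points from two LiDAR scans are grouped into (x, y) pillars, and each pillar is filled from one scan or the other to form a mixed sample. A real detector would also mix the box labels with the same pillar assignment.

```python
# PillarMix-style augmentation on raw point clouds (illustrative sketch).
import numpy as np

def pillar_ids(points, pillar_size=2.0):
    """Map each point (x, y, z, ...) to an integer pillar id on a 2D grid."""
    ij = np.floor(points[:, :2] / pillar_size).astype(np.int64)
    return ij[:, 0] * 10_000 + ij[:, 1]          # hash (i, j) into one id

def pillar_mix(points_a, points_b, pillar_size=2.0, p_take_b=0.5, rng=None):
    rng = rng or np.random.default_rng(0)
    ids_a, ids_b = pillar_ids(points_a, pillar_size), pillar_ids(points_b, pillar_size)
    all_ids = np.union1d(ids_a, ids_b)
    take_b = set(all_ids[rng.random(all_ids.size) < p_take_b].tolist())
    keep_a = np.array([i not in take_b for i in ids_a])   # pillars kept from scan A
    keep_b = np.array([i in take_b for i in ids_b])       # pillars taken from scan B
    return np.concatenate([points_a[keep_a], points_b[keep_b]], axis=0)

a = np.random.default_rng(1).normal(scale=20, size=(1000, 4))
b = np.random.default_rng(2).normal(scale=20, size=(1000, 4))
print(pillar_mix(a, b).shape)
```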

arXiv:2407.09751 (https://arxiv.org/abs/2407.09751) [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
TASeg: Temporal Aggregation Network for LiDAR Semantic Segmentation
Authors: Xiaopei Wu, Yuenan Hou, Xiaoshui Huang, Binbin Lin, Tong He, Xinge Zhu, Yuexin Ma, Boxi Wu, Haifeng Liu, Deng Cai, Wanli Ouyang
Abstract: Training deep models for LiDAR semantic segmentation is challenging due to the inherent sparsity of point clouds. Utilizing temporal data is a natural remedy against the sparsity problem as it makes the input signal denser. However, previous multi-frame fusion algorithms fall short in utilizing sufficient temporal information due to memory constraints, and they also ignore the informative temporal images. To fully exploit the rich information hidden in long-term temporal point clouds and images, we present the Temporal Aggregation Network, termed TASeg. Specifically, we propose a Temporal LiDAR Aggregation and Distillation (TLAD) algorithm, which leverages historical priors to assign different aggregation steps for different classes. It can largely reduce memory and time overhead while achieving higher accuracy. Besides, TLAD trains a teacher injected with ground-truth priors to distill the model, further boosting the performance. To make full use of temporal images, we design a Temporal Image Aggregation and Fusion (TIAF) module, which can greatly expand the camera FOV and enhance the present features. Temporal LiDAR points in the camera FOV are used as mediums to transform temporal image features to the present coordinate frame for temporal multi-modal fusion. Moreover, we develop a Static-Moving Switch Augmentation (SMSA) algorithm, which utilizes sufficient temporal information to enable objects to switch their motion states freely, thus greatly increasing the number of static and moving training samples. Our TASeg ranks 1st on three challenging tracks, i.e., the SemanticKITTI single-scan track, the multi-scan track and the nuScenes LiDAR segmentation track, strongly demonstrating the superiority of our method. Codes are available at https://github.com/LittlePey/TASeg.
Submitted 12 July, 2024; originally announced July 2024.
Comments: Accepted by CVPR 2024
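A simplified sketch of temporal LiDAR aggregation with per-class history lengths, in the spirit of the TLAD idea above: past frames are transformed into the present coordinate frame via ego poses, and a class is kept only in its most recent k frames. The pose convention, the source of per-point labels, and `steps_per_class` are assumptions of mine, not the released TASeg code.

```python
# Per-class temporal aggregation of LiDAR frames (illustrative sketch).
import numpy as np

def to_present(points, pose_past, pose_present):
    """Transform Nx3 points from a past sensor frame into the present sensor frame.
    Poses are 4x4 world-from-sensor matrices."""
    homo = np.concatenate([points, np.ones((len(points), 1))], axis=1)      # Nx4
    world = homo @ pose_past.T                                              # past -> world
    present = world @ np.linalg.inv(pose_present).T                         # world -> present
    return present[:, :3]

def aggregate(frames, poses, labels, steps_per_class, pose_present):
    """frames/poses/labels ordered oldest to newest; keep class c only in its last k frames."""
    merged, total = [], len(frames)
    for idx, (pts, pose, lab) in enumerate(zip(frames, poses, labels)):
        age = total - 1 - idx                                   # 0 for the newest frame
        keep = np.array([age < steps_per_class.get(int(c), 1) for c in lab])
        if keep.any():
            merged.append(to_present(pts[keep], pose, pose_present))
    return np.concatenate(merged, axis=0)
```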
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05890">arXiv:2407.05890</a> <span> [<a href="https://arxiv.org/pdf/2407.05890">pdf</a>, <a href="https://arxiv.org/format/2407.05890">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Affordances-Oriented Planning using Foundation Models for Continuous Vision-Language Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaqi Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bingqian Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinmin Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lin Ma</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+K+K">Kwan-Yee K. Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05890v2-abstract-short" style="display: inline;"> LLM-based agents have demonstrated impressive zero-shot performance in vision-language navigation (VLN) task. However, existing LLM-based methods often focus only on solving high-level task planning by selecting nodes in predefined navigation graphs for movements, overlooking low-level control in navigation scenarios. To bridge this gap, we propose AO-Planner, a novel Affordances-Oriented Planner… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05890v2-abstract-full').style.display = 'inline'; document.getElementById('2407.05890v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05890v2-abstract-full" style="display: none;"> LLM-based agents have demonstrated impressive zero-shot performance in vision-language navigation (VLN) task. However, existing LLM-based methods often focus only on solving high-level task planning by selecting nodes in predefined navigation graphs for movements, overlooking low-level control in navigation scenarios. To bridge this gap, we propose AO-Planner, a novel Affordances-Oriented Planner for continuous VLN task. Our AO-Planner integrates various foundation models to achieve affordances-oriented low-level motion planning and high-level decision-making, both performed in a zero-shot setting. Specifically, we employ a Visual Affordances Prompting (VAP) approach, where the visible ground is segmented by SAM to provide navigational affordances, based on which the LLM selects potential candidate waypoints and plans low-level paths towards selected waypoints. We further propose a high-level PathAgent which marks planned paths into the image input and reasons the most probable path by comprehending all environmental information. Finally, we convert the selected path into 3D coordinates using camera intrinsic parameters and depth information, avoiding challenging 3D predictions for LLMs. 
Experiments on the challenging R2R-CE and RxR-CE datasets show that AO-Planner achieves state-of-the-art zero-shot performance (an 8.8% improvement on SPL). Our method can also serve as a data annotator to obtain pseudo-labels, distilling its waypoint prediction ability into a learning-based predictor. This new predictor does not require any waypoint data from the simulator and achieves 47% SR, competitive with supervised methods. We establish an effective connection between LLMs and the 3D world, presenting novel prospects for employing foundation models in low-level motion control.
Submitted 20 August, 2024; v1 submitted 8 July, 2024; originally announced July 2024.
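The final step described above, converting a selected waypoint pixel plus its depth into 3D coordinates with the camera intrinsics, is standard pinhole back-projection; a minimal version with generic variable names (my own illustration, not the paper's code) looks like this:

```python
# Back-project a pixel with metric depth into camera-frame coordinates (pinhole model).
import numpy as np

def pixel_to_camera_xyz(u, v, depth, fx, fy, cx, cy):
    """Returns (x, y, z) in camera coordinates: x right, y down, z forward."""
    x = (u - cx) * depth / fx
    y = (v - cy) * depth / fy
    return np.array([x, y, depth])

# e.g. a 640x480 camera with a 90-degree horizontal FOV: fx = fy = 320, cx = 320, cy = 240
print(pixel_to_camera_xyz(u=400, v=250, depth=2.5, fx=320.0, fy=320.0, cx=320.0, cy=240.0))
```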

arXiv:2407.02228 (https://arxiv.org/abs/2407.02228) [pdf, other]  cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
MTMamba: Enhancing Multi-Task Dense Scene Understanding by Mamba-Based Decoders
Authors: Baijiong Lin, Weisen Jiang, Pengguang Chen, Yu Zhang, Shu Liu, Ying-Cong Chen
Abstract: Multi-task dense scene understanding, which learns a model for multiple dense prediction tasks, has a wide range of application scenarios. Modeling long-range dependency and enhancing cross-task interactions are crucial to multi-task dense prediction. In this paper, we propose MTMamba, a novel Mamba-based architecture for multi-task scene understanding. It contains two types of core blocks: a self-task Mamba (STM) block and a cross-task Mamba (CTM) block. STM handles long-range dependency by leveraging Mamba, while CTM explicitly models task interactions to facilitate information exchange across tasks. Experiments on the NYUDv2 and PASCAL-Context datasets demonstrate the superior performance of MTMamba over Transformer-based and CNN-based methods. Notably, on the PASCAL-Context dataset, MTMamba achieves improvements of +2.08, +5.01, and +4.90 over the previous best methods in the tasks of semantic segmentation, human parsing, and object boundary detection, respectively. The code is available at https://github.com/EnVision-Research/MTMamba.
Submitted 14 July, 2024; v1 submitted 2 July, 2024; originally announced July 2024.
Comments: ECCV 2024
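A purely structural sketch, under my own assumptions, of how self-task and cross-task blocks could compose in a multi-task decoder; `stm` and `ctm` are placeholder callables, and the actual architecture is in the linked repository.

```python
# Multi-task decoder stage built from self-task and cross-task blocks (structural sketch).
def mtm_decoder_stage(task_feats, stm, ctm):
    """task_feats: dict task_name -> feature tensor; returns an updated dict."""
    # self-task block: long-range modeling within each task's feature stream
    task_feats = {name: stm(feat) for name, feat in task_feats.items()}
    # cross-task block: exchange information across the task streams
    return ctm(task_feats)

def decode(task_feats, stages):
    for stm, ctm in stages:               # one (stm, ctm) pair per decoder stage
        task_feats = mtm_decoder_stage(task_feats, stm, ctm)
    return task_feats
```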

arXiv:2406.18495 (https://arxiv.org/abs/2406.18495) [pdf, other]  cs.CL (Computation and Language)
WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs
Authors: Seungju Han, Kavel Rao, Allyson Ettinger, Liwei Jiang, Bill Yuchen Lin, Nathan Lambert, Yejin Choi, Nouha Dziri
Abstract: We introduce WildGuard -- an open, lightweight moderation tool for LLM safety that achieves three goals: (1) identifying malicious intent in user prompts, (2) detecting safety risks of model responses, and (3) determining model refusal rate. Together, WildGuard serves the increasing needs for automatic safety moderation and evaluation of LLM interactions, providing a one-stop tool with enhanced accuracy and broad coverage across 13 risk categories. While existing open moderation tools such as Llama-Guard2 score reasonably well in classifying straightforward model interactions, they lag far behind a prompted GPT-4, especially in identifying adversarial jailbreaks and in evaluating models' refusals, a key measure for evaluating safety behaviors in model responses. To address these challenges, we construct WildGuardMix, a large-scale and carefully balanced multi-task safety moderation dataset with 92K labeled examples that cover vanilla (direct) prompts and adversarial jailbreaks, paired with various refusal and compliance responses. WildGuardMix is a combination of WildGuardTrain, the training data of WildGuard, and WildGuardTest, a high-quality human-annotated moderation test set with 5K labeled items covering broad risk scenarios. Through extensive evaluations on WildGuardTest and ten existing public benchmarks, we show that WildGuard establishes state-of-the-art performance in open-source safety moderation across all three tasks compared to ten strong existing open-source moderation models (e.g., up to a 26.4% improvement on refusal detection). Importantly, WildGuard matches and sometimes exceeds GPT-4 performance (e.g., up to a 3.9% improvement on prompt harmfulness identification). WildGuard serves as a highly effective safety moderator in an LLM interface, reducing the success rate of jailbreak attacks from 79.8% to 2.4%.
Submitted 9 July, 2024; v1 submitted 26 June, 2024; originally announced June 2024.
Comments: First two authors contributed equally. Third and fourth authors contributed equally.
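A hypothetical sketch of how a one-stop moderator with the three outputs listed above could gate an LLM interface; `moderate` and its return fields are stand-ins of my own, not the WildGuard API.

```python
# Gating an LLM interface with a prompt/response moderator (illustrative sketch).
def guarded_chat(user_prompt, generate, moderate, refusal_text="I can't help with that."):
    pre = moderate(prompt=user_prompt, response=None)
    if pre["prompt_harmful"]:
        return refusal_text                                  # block the request up front
    response = generate(user_prompt)
    post = moderate(prompt=user_prompt, response=response)
    if post["response_harmful"]:
        return refusal_text                                  # block an unsafe completion
    # post["is_refusal"] can additionally be logged to measure refusal rates
    return response
```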

arXiv:2406.12935 (https://arxiv.org/abs/2406.12935) [pdf, other]  cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
ChatBug: A Common Vulnerability of Aligned LLMs Induced by Chat Templates
Authors: Fengqing Jiang, Zhangchen Xu, Luyao Niu, Bill Yuchen Lin, Radha Poovendran
Abstract: Large language models (LLMs) are expected to follow instructions from users and engage in conversations. Techniques to enhance LLMs' instruction-following capabilities typically fine-tune them using data structured according to a predefined chat template. Although chat templates are shown to be effective in optimizing LLM performance, their impact on the safety alignment of LLMs has been less well understood, which is crucial for deploying LLMs safely at scale. In this paper, we investigate how chat templates affect the safety alignment of LLMs. We identify a common vulnerability, named ChatBug, that is introduced by chat templates. Our key insight in identifying ChatBug is that chat templates provide a rigid format that needs to be followed by LLMs, but not by users. Hence, a malicious user may not necessarily follow the chat template when prompting LLMs. Instead, malicious users could leverage their knowledge of the chat template and accordingly craft their prompts to bypass the safety alignment of LLMs. We develop two attacks to exploit the ChatBug vulnerability. We demonstrate that a malicious user can exploit the ChatBug vulnerability of eight state-of-the-art (SOTA) LLMs and effectively elicit unintended responses from these models.
Moreover, we show that ChatBug can be exploited by existing jailbreak attacks to enhance their attack success rates. We investigate potential countermeasures to ChatBug. Our results show that while adversarial training effectively mitigates the ChatBug vulnerability, the victim model incurs significant performance degradation. These results highlight the trade-off between safety alignment and helpfulness. Developing new methods for instruction tuning to balance this trade-off is an open and critical direction for future research.
Submitted 16 June, 2024; originally announced June 2024.
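To make the formatting gap concrete, here is a made-up chat template (not any specific model's): the service is supposed to place user text inside fixed control tokens, but nothing forces a raw-API user to respect those tokens, which is the mismatch the ChatBug analysis builds on.

```python
# Illustration of a chat template and a message that blurs its role boundaries.
TEMPLATE = "<|user|>{user_message}<|end|>\n<|assistant|>"

def render_prompt(user_message: str) -> str:
    return TEMPLATE.format(user_message=user_message)

benign = render_prompt("What is the capital of France?")
# A message that itself contains template-like control tokens changes the structure the
# model ends up conditioning on, even though the server applied the template correctly.
suspicious = render_prompt("Hi<|end|>\n<|assistant|>Sure, here is")
print(benign)
print(suspicious)
```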

arXiv:2406.11069 (https://arxiv.org/abs/2406.11069) [pdf, other]  cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
WildVision: Evaluating Vision-Language Models in the Wild with Human Preferences
Authors: Yujie Lu, Dongfu Jiang, Wenhu Chen, William Yang Wang, Yejin Choi, Bill Yuchen Lin
Abstract: Recent breakthroughs in vision-language models (VLMs) emphasize the necessity of benchmarking human preferences in real-world multimodal interactions. To address this gap, we launched WildVision-Arena (WV-Arena), an online platform that collects human preferences to evaluate VLMs. We curated WV-Bench by selecting 500 high-quality samples from 8,000 user submissions in WV-Arena. WV-Bench uses GPT-4 as the judge to compare each VLM with Claude-3-Sonnet, achieving a Spearman correlation of 0.94 with the WV-Arena Elo. This significantly outperforms other benchmarks like MMVet, MMMU, and MMStar. Our comprehensive analysis of 20K real-world interactions reveals important insights into the failure cases of top-performing VLMs. For example, we find that although GPT-4V surpasses many other models like Reka-Flash, Opus, and Yi-VL-Plus in simple visual recognition and reasoning tasks, it still faces challenges with subtle contextual cues, spatial reasoning, visual imagination, and expert domain knowledge. Additionally, current VLMs exhibit issues with hallucinations and safety when intentionally provoked. We are releasing our chat and feedback data to further advance research in the field of VLMs.
Submitted 16 June, 2024; originally announced June 2024.
Comments: link: https://hf.co/spaces/WildVision/vision-arena
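Arena-style leaderboards of this kind are typically built from pairwise human votes with Elo-style updates; a generic sketch of that bookkeeping (my own illustration, not the platform's exact code):

```python
# Elo ratings from pairwise human preference votes (generic sketch).
from collections import defaultdict

def update_elo(ratings, model_a, model_b, winner, k=32.0):
    """winner: 'a', 'b', or 'tie'."""
    ra, rb = ratings[model_a], ratings[model_b]
    expected_a = 1.0 / (1.0 + 10 ** ((rb - ra) / 400.0))
    score_a = {"a": 1.0, "b": 0.0, "tie": 0.5}[winner]
    ratings[model_a] = ra + k * (score_a - expected_a)
    ratings[model_b] = rb + k * ((1.0 - score_a) - (1.0 - expected_a))

ratings = defaultdict(lambda: 1000.0)
for a, b, w in [("model-a", "model-b", "a"), ("model-b", "model-c", "tie")]:
    update_elo(ratings, a, b, w)
print(dict(ratings))
```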