Search | arXiv e-print repository

Showing 1–50 of 495 results for author: Zhu, M

Searching in archive cs.

1. arXiv:2411.13547 [pdf, other]
Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
Title: SpecTool: A Benchmark for Characterizing Errors in Tool-Use LLMs
Authors: Shirley Kokane, Ming Zhu, Tulika Awalgaonkar, Jianguo Zhang, Thai Hoang, Akshara Prabhakar, Zuxin Liu, Tian Lan, Liangwei Yang, Juntao Tan, Rithesh Murthy, Weiran Yao, Zhiwei Liu, Juan Carlos Niebles, Huan Wang, Shelby Heinecke, Caiming Xiong, Silvio Savarese
Abstract: Evaluating the output of Large Language Models (LLMs) is one of the most critical aspects of building a performant compound AI system. Since the output from LLMs propagates to downstream steps, identifying LLM errors is crucial to system performance. A common task for LLMs in AI systems is tool use. While there are several benchmark environments for evaluating LLMs on this task, they typically only give a success rate without any explanation of the failure cases. To solve this problem, we introduce SpecTool, a new benchmark to identify error patterns in LLM output on tool-use tasks. Our benchmark dataset comprises queries from diverse environments that can be used to test for the presence of seven newly characterized error patterns. Using SpecTool, we show that even the most prominent LLMs exhibit these error patterns in their outputs. Researchers can use the analysis and insights from SpecTool to guide their error mitigation strategies.
Submitted 20 November, 2024; originally announced November 2024.
2. arXiv:2411.12814 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Interactive Medical Image Segmentation: A Benchmark Dataset and Baseline
Authors: Junlong Cheng, Bin Fu, Jin Ye, Guoan Wang, Tianbin Li, Haoyu Wang, Ruoyu Li, He Yao, Junren Chen, JingWen Li, Yanzhou Su, Min Zhu, Junjun He
Abstract: Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 million medical images and their corresponding ground-truth masks from multiple data sources. Then, leveraging the strong object-recognition capabilities of a vision foundation model, we automatically generate dense interactive masks for each image and ensure their quality through rigorous quality control and granularity management. Unlike previous datasets, which are limited by specific modalities or sparse annotations, IMed-361M spans 14 modalities and 204 segmentation targets, totaling 361 million masks (an average of 56 masks per image). Finally, we develop an IMIS baseline network on this dataset that supports high-quality mask generation through interactive inputs, including clicks, bounding boxes, text prompts, and their combinations. We evaluate its performance on medical image segmentation tasks from multiple perspectives, demonstrating superior accuracy and scalability compared to existing interactive segmentation models. To facilitate research on foundation models in medical computer vision, we release IMed-361M and the model at https://github.com/uni-medical/IMIS-Bench.
Submitted 19 November, 2024; originally announced November 2024.

3. arXiv:2411.11525 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Reliable Poisoned Sample Detection against Backdoor Attacks Enhanced by Sharpness Aware Minimization
Authors: Mingda Zhang, Mingli Zhu, Zihao Zhu, Baoyuan Wu
Abstract: Backdoor attacks have been considered a serious security threat to deep neural networks (DNNs). Poisoned sample detection (PSD), which aims to filter poisoned samples out of an untrustworthy training dataset, has shown very promising performance in defending against data-poisoning-based backdoor attacks. However, we observe that the detection performance of many advanced methods is likely to be unstable when facing weak backdoor attacks, such as a low poisoning ratio or weak trigger strength. To further verify this observation, we conduct a statistical investigation across various backdoor attacks and poisoned sample detections, showing a positive correlation between the backdoor effect and detection performance. This inspires us to strengthen the backdoor effect in order to enhance detection performance. Since we cannot achieve that goal by directly manipulating the poisoning ratio or trigger strength, we propose to train one model using the Sharpness-Aware Minimization (SAM) algorithm rather than the vanilla training algorithm. We also provide both empirical and theoretical analysis of how SAM training strengthens the backdoor effect. This SAM-trained model can then be seamlessly integrated with any off-the-shelf PSD method that extracts discriminative features from the trained model for detection, called SAM-enhanced PSD. Extensive experiments on several benchmark datasets show the reliable detection performance of the proposed method against both weak and strong backdoor attacks, with significant improvements over conventional PSD methods (i.e., without SAM enhancement) across various attacks (+34.38% TPR on average). Overall, this work provides new insights about PSD and proposes a novel approach that can complement existing detection methods, which may inspire more in-depth explorations in this field.
Submitted 18 November, 2024; originally announced November 2024.
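For orientation, the two-step Sharpness-Aware Minimization update that this defense substitutes for vanilla training can be sketched as follows. This is a generic SAM step in PyTorch, not the authors' code; the perturbation radius rho and the optimizer wiring are assumptions.

    import torch

    def sam_step(model, loss_fn, x, y, optimizer, rho=0.05):
        # 1) ascent: perturb weights by rho * g / ||g|| toward higher loss
        loss_fn(model(x), y).backward()
        grads = [p.grad for p in model.parameters() if p.grad is not None]
        grad_norm = torch.norm(torch.stack([g.norm() for g in grads]))
        eps = []
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is None:
                    eps.append(None)
                    continue
                e = rho * p.grad / (grad_norm + 1e-12)
                p.add_(e)                      # move to w + eps
                eps.append(e)
        optimizer.zero_grad()
        # 2) descent: gradient taken at the perturbed point, applied at w
        loss = loss_fn(model(x), y)
        loss.backward()
        with torch.no_grad():
            for p, e in zip(model.parameters(), eps):
                if e is not None:
                    p.sub_(e)                  # restore w before stepping
        optimizer.step()
        optimizer.zero_grad()
        return loss.item()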
4. arXiv:2411.10915 [pdf, ps, other]
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
Title: Bias in Large Language Models: Origin, Evaluation, and Mitigation
Authors: Yufei Guo, Muzhe Guo, Juntao Su, Zhou Yang, Mengqiu Zhu, Hongfei Li, Mengyang Qiu, Shuo Shuo Liu
Abstract: Large Language Models (LLMs) have revolutionized natural language processing, but their susceptibility to biases poses significant challenges. This comprehensive review examines the landscape of bias in LLMs, from its origins to current mitigation strategies. We categorize biases as intrinsic and extrinsic, analyzing their manifestations in various NLP tasks. The review critically assesses a range of bias evaluation methods, including data-level, model-level, and output-level approaches, providing researchers with a robust toolkit for bias detection. We further explore mitigation strategies, categorizing them into pre-model, intra-model, and post-model techniques, highlighting their effectiveness and limitations. Ethical and legal implications of biased LLMs are discussed, emphasizing potential harms in real-world applications such as healthcare and criminal justice. By synthesizing current knowledge on bias in LLMs, this review contributes to the ongoing effort to develop fair and responsible AI systems. Our work serves as a comprehensive resource for researchers and practitioners working towards understanding, evaluating, and mitigating bias in LLMs, fostering the development of more equitable AI technologies.
Submitted 16 November, 2024; originally announced November 2024.
5. arXiv:2411.09365 [pdf, other]
Subjects: cs.LG (Machine Learning); math.OC (Optimization and Control)
Title: Stability and Generalization for Distributed SGDA
Authors: Miaoxi Zhu, Yan Sun, Li Shen, Bo Du, Dacheng Tao
Abstract: Minimax optimization is gaining increasing attention in modern machine learning applications. Driven by large-scale models and massive volumes of data collected from edge devices, as well as the concern to preserve client privacy, communication-efficient distributed minimax optimization algorithms have become popular, such as Local Stochastic Gradient Descent Ascent (Local-SGDA) and Local Decentralized SGDA (Local-DSGDA). While most existing research on distributed minimax algorithms focuses on convergence rates, computational complexity, and communication efficiency, generalization performance remains underexplored, even though generalization ability is a pivotal indicator of a model's holistic performance on unseen data. In this paper, we propose a stability-based generalization analytical framework for Distributed-SGDA that unifies two popular distributed minimax algorithms, Local-SGDA and Local-DSGDA, and we conduct a comprehensive analysis of the stability error, generalization gap, and population risk across different metrics under various settings, e.g., the (S)C-(S)C, PL-SC, and NC-NC cases. Our theoretical results reveal a trade-off between the generalization gap and the optimization error and suggest hyperparameter choices that obtain the optimal population risk. Numerical experiments for Local-SGDA and Local-DSGDA validate the theoretical results.
Submitted 14 November, 2024; originally announced November 2024.
6. arXiv:2411.07618 [pdf, other]
Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: Direct Preference Optimization Using Sparse Feature-Level Constraints
Authors: Qingyu Yin, Chak Tou Leong, Hongbo Zhang, Minjun Zhu, Hanqi Yan, Qiang Zhang, Yulan He, Wenjie Li, Jun Wang, Yue Zhang, Linyi Yang
Abstract: The alignment of large language models (LLMs) with human preferences remains a key challenge. While post-training techniques like Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have achieved notable success, they often introduce computational inefficiencies and training instability. In this paper, we propose Feature-level constrained Preference Optimization (FPO), a novel method designed to simplify the alignment process while ensuring stability. FPO leverages pre-trained Sparse Autoencoders (SAEs) and introduces feature-level constraints, allowing for efficient, sparsity-enforced alignment. Our approach achieves efficiency by using sparse features activated in a well-trained sparse autoencoder and preserves the quality of sequential KL divergence by using a feature-level offline reference. Experimental results on benchmark datasets demonstrate that FPO achieves a 5.08% absolute improvement in win rate at much lower computational cost than state-of-the-art baselines, making it a promising solution for efficient and controllable LLM alignment.
Submitted 12 November, 2024; originally announced November 2024.
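A toy rendering of the feature-level constraint idea follows. It assumes a frozen ReLU sparse-autoencoder encoder and uses a simple MSE penalty in place of the paper's sequential KL term; all names, shapes, and hyperparameters are invented for illustration and do not reproduce FPO's actual objective.

    import torch
    import torch.nn.functional as F

    d_model, d_sae = 64, 256
    torch.manual_seed(0)
    W_enc = torch.randn(d_sae, d_model) / d_model ** 0.5  # frozen SAE encoder
    b_enc = torch.zeros(d_sae)

    def sae_features(h):
        # h: (batch, d_model) hidden states -> sparse, non-negative features
        return F.relu(h @ W_enc.T + b_enc)

    def fpo_style_loss(logp_w, logp_l, ref_logp_w, ref_logp_l,
                       h_policy, h_ref, beta=0.1, lam=0.01):
        # standard DPO preference term on (winner, loser) log-probs
        margin = beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))
        pref = -F.logsigmoid(margin).mean()
        # feature-level constraint: keep policy SAE features near the reference's
        feat_gap = F.mse_loss(sae_features(h_policy), sae_features(h_ref))
        return pref + lam * feat_gap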
7. arXiv:2411.03402 [pdf, other]
Subjects: q-fin.PM (Portfolio Management); cs.CY (Computers and Society); cs.LG (Machine Learning)
Title: Climate AI for Corporate Decarbonization Metrics Extraction
Authors: Aditya Dave, Mengchen Zhu, Dapeng Hu, Sachin Tiwari
Abstract: Corporate Greenhouse Gas (GHG) emission targets are important metrics in sustainable investing [12, 16]. To provide a comprehensive view of company emission objectives, we propose an approach to source these metrics from company public disclosures. Without automation, curating these metrics manually is a labor-intensive process that requires combing through lengthy corporate sustainability disclosures that often do not follow a standard format. Furthermore, the resulting dataset needs to be validated thoroughly by Subject Matter Experts (SMEs), further lengthening the time-to-market. We introduce the Climate Artificial Intelligence for Corporate Decarbonization Metrics Extraction (CAI) model and pipeline, a novel approach utilizing Large Language Models (LLMs) to extract and validate linked metrics from corporate disclosures. We demonstrate that the process improves data collection efficiency and accuracy by automating data curation, validation, and metric scoring from public corporate disclosures. We further show that our results are agnostic to the choice of LLMs. This framework can be applied broadly to information extraction from textual data.
Submitted 5 November, 2024; originally announced November 2024.

8. arXiv:2411.02028 [pdf]
Subjects: cs.RO (Robotics)
Title: An Immediate Update Strategy of Multi-State Constraint Kalman Filter
Authors: Qingchao Zhang, Wei Ouyang, Jiale Han, Qi Cai, Maoran Zhu, Yuanxin Wu
Abstract: The lightweight Multi-State Constraint Kalman Filter (MSCKF) is well known for its high efficiency, and the delayed update has usually been adopted since its proposal. This work investigates an immediate update strategy for MSCKF based on timely reconstructed 3D feature points and measurement constraints. The differences between the delayed update and the immediate update are theoretically analyzed in detail. It is found that the immediate update helps construct more observation constraints and employs more filtering updates than the delayed update, which improves the linearization point of the measurement model and therefore enhances estimation accuracy. Numerical simulations and experiments show that the immediate update strategy significantly enhances MSCKF even with a small number of feature observations.
Submitted 4 November, 2024; originally announced November 2024.
Comments: 8 pages, 5 figures
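The update whose timing is at issue is the standard Kalman measurement update, sketched below in NumPy. A real MSCKF additionally maintains a sliding window of camera poses and nullspace-projected feature constraints, which this toy omits; the numbers in the usage example are assumptions.

    import numpy as np

    def kalman_update(x, P, z, H, R):
        """One measurement update: state x (n,), covariance P (n,n),
        measurement z (m,), model z = H x + noise with covariance R."""
        y = z - H @ x                        # innovation
        S = H @ P @ H.T + R                  # innovation covariance
        K = P @ H.T @ np.linalg.inv(S)       # Kalman gain
        x_new = x + K @ y
        P_new = (np.eye(len(x)) - K @ H) @ P
        return x_new, P_new

    # toy usage: 2-state system, scalar position measurement
    x = np.array([0.0, 1.0]); P = np.eye(2)
    H = np.array([[1.0, 0.0]]); R = np.array([[0.25]])
    x, P = kalman_update(x, P, np.array([0.4]), H, R)
    print(x)  # state nudged toward the measurement

The immediate strategy applies such updates as soon as feature constraints can be formed, rather than waiting for the feature track to terminate, which is the delayed behavior the paper compares against.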
9. arXiv:2411.01647 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Optical Flow Representation Alignment Mamba Diffusion Model for Medical Video Generation
Authors: Zhenbin Wang, Lei Zhang, Lituan Wang, Minjuan Zhu, Zhenwei Zhang
Abstract: Medical video generation models are expected to have a profound impact on the healthcare industry, including but not limited to medical education and training, surgical planning, and simulation. Current video diffusion models typically build on image diffusion architectures by incorporating temporal operations (such as 3D convolution and temporal attention). Although this approach is effective, its oversimplification limits spatio-temporal performance and consumes substantial computational resources. To counter this, we propose the Medical Simulation Video Generator (MedSora), which incorporates three key elements: i) a video diffusion framework that integrates the advantages of attention and Mamba, balancing low computational load with high-quality video generation; ii) an optical flow representation alignment method that implicitly enhances attention to inter-frame pixels; and iii) a video variational autoencoder (VAE) with frequency compensation that addresses the information loss of medical features when transforming pixel space into latent features and back into pixel frames. Extensive experiments and applications demonstrate that MedSora exhibits superior visual quality in generating medical videos, outperforming the most advanced baseline methods. Further results and code are available at https://wongzbb.github.io/MedSora
Submitted 3 November, 2024; originally announced November 2024.
10. arXiv:2411.00816 [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.CY (Computers and Society); cs.LG (Machine Learning)
Title: CycleResearcher: Improving Automated Research via Automated Review
Authors: Yixuan Weng, Minjun Zhu, Guangsheng Bao, Hongbo Zhang, Jindong Wang, Yue Zhang, Linyi Yang
Abstract: The automation of scientific discovery has been a long-standing goal within the research community, driven by the potential to accelerate knowledge creation. While significant progress has been made using commercial large language models (LLMs) as research assistants or idea generators, the possibility of automating the entire research process with open-source LLMs remains largely unexplored. This paper explores the feasibility of using open-source post-trained LLMs as autonomous agents capable of performing the full cycle of automated research and review, from literature review and manuscript preparation to peer review and paper revision. Our iterative preference training framework consists of CycleResearcher, which conducts research tasks, and CycleReviewer, which simulates the peer review process, providing iterative feedback via reinforcement learning. To train these models, we develop two new datasets, Review-5k and Research-14k, reflecting real-world machine learning research and peer review dynamics. Our results demonstrate that CycleReviewer achieves a 26.89% improvement in mean absolute error (MAE) over individual human reviewers in predicting paper scores, indicating that LLMs can surpass expert-level performance in research evaluation. In research, papers generated by the CycleResearcher model achieved a score of 5.36 in simulated peer reviews, surpassing the preprint level of 5.24 from human experts and approaching the accepted-paper level of 5.69. This work represents a significant step toward fully automated scientific inquiry, providing ethical safeguards and advancing AI-driven research capabilities. The code, datasets, and model weights are released at http://github/minjun-zhu/Researcher.
Submitted 28 October, 2024; originally announced November 2024.

11. arXiv:2410.18742
Subjects: cs.SI (Social and Information Networks)
Title: Continuous Dynamic Modeling via Neural ODEs for Popularity Trajectory Prediction
Authors: Songbo Yang, Ziwei Zhao, Zihang Chen, Haotian Zhang, Tong Xu, Mengxiao Zhu
Abstract: Popularity prediction for information cascades has significant applications across various domains, including opinion monitoring and advertising recommendations. While most existing methods treat this as a discrete problem, popularity actually evolves continuously, exhibiting rich dynamic properties such as change rates and growth patterns. In this paper, we argue that popularity trajectory prediction is more practical, as it aims to forecast the entire trajectory of how popularity unfolds over arbitrary future times. This approach offers insights into both instantaneous popularity and the underlying dynamic properties. However, traditional methods for popularity trajectory prediction rely primarily on specific diffusion mechanism assumptions, which may not align well with real-world dynamics and compromise their performance. To address these limitations, we propose NODEPT, a novel approach based on neural ordinary differential equations (ODEs) for popularity trajectory prediction. NODEPT models the continuous dynamics of the underlying diffusion system using neural ODEs. We first employ an encoder to initialize the latent state representations of information cascades, consisting of two representation-learning modules that capture the co-evolving structural characteristics and temporal patterns of cascades from different perspectives. More importantly, we then introduce an ODE-based generative module that learns the dynamics of the diffusion system in the latent space. Finally, a decoder transforms the latent state into the prediction of the future popularity trajectory. Our experimental results on three real-world datasets demonstrate the superiority and rationality of the proposed NODEPT method.
Submitted 31 October, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: The time complexity analysis in Section 4.4 contains an error; we overlooked the impact of the memory module
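The core neural-ODE mechanism (evolve a latent state with a learned vector field, then decode popularity at arbitrary query times) can be sketched with the torchdiffeq package, assuming it is installed. This is not NODEPT's architecture, only the decoding pattern the abstract describes; the dimensions are invented.

    import torch
    import torch.nn as nn
    from torchdiffeq import odeint  # pip install torchdiffeq (assumed)

    class LatentDynamics(nn.Module):
        def __init__(self, dim=16):
            super().__init__()
            self.net = nn.Sequential(nn.Linear(dim, 64), nn.Tanh(),
                                     nn.Linear(64, dim))
        def forward(self, t, z):              # dz/dt = f_theta(z)
            return self.net(z)

    dynamics = LatentDynamics()
    decoder = nn.Linear(16, 1)                # latent state -> popularity

    z0 = torch.zeros(1, 16)                   # an encoder's output would go here
    t = torch.linspace(0.0, 10.0, steps=25)   # arbitrary future time grid
    z_t = odeint(dynamics, z0, t)             # (25, 1, 16) latent trajectory
    popularity = decoder(z_t).squeeze(-1)     # predicted trajectory over t
    print(popularity.shape)                   # torch.Size([25, 1])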
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The time complexity analysis in section 4.4 contains error; we overlooked the impact of the memory module</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18528">arXiv:2410.18528</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18528">pdf</a>, <a href="https://arxiv.org/format/2410.18528">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PRACT: Optimizing Principled Reasoning and Acting of LLM Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+W">Weiran Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianguo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Murthy%2C+R">Rithesh Murthy</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Liangwei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+T">Tian Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Ming Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Juntao Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Kokane%2C+S">Shirley Kokane</a>, <a href="/search/cs?searchtype=author&amp;query=Hoang%2C+T">Thai Hoang</a>, <a href="/search/cs?searchtype=author&amp;query=Niebles%2C+J+C">Juan Carlos Niebles</a>, <a href="/search/cs?searchtype=author&amp;query=Heinecke%2C+S">Shelby Heinecke</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Huan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Savarese%2C+S">Silvio Savarese</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+C">Caiming Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18528v1-abstract-short" style="display: inline;"> We introduce the Principled Reasoning and Acting (PRAct) framework, a novel method for learning and enforcing action principles from trajectory data. Central to our approach is the use of text gradients from a reflection and optimization engine to derive these action principles. To adapt action principles to specific task requirements, we propose a new optimization framework, Reflective Principle&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18528v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18528v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18528v1-abstract-full" style="display: none;"> We introduce the Principled Reasoning and Acting (PRAct) framework, a novel method for learning and enforcing action principles from trajectory data. Central to our approach is the use of text gradients from a reflection and optimization engine to derive these action principles. To adapt action principles to specific task requirements, we propose a new optimization framework, Reflective Principle Optimization (RPO). 
After execution, RPO employs a reflector to critique current action principles and an optimizer to update them accordingly. We develop the RPO framework under two scenarios: Reward-RPO, which uses environmental rewards for reflection, and Self-RPO, which conducts self-reflection without external rewards. Additionally, two RPO methods, RPO-Traj and RPO-Batch, are introduced to adapt to different settings. Experimental results across four environments demonstrate that the PRAct agent, leveraging the RPO framework, effectively learns and applies action principles to enhance performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18528v1-abstract-full').style.display = 'none'; document.getElementById('2410.18528v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SIG CoNLL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16795">arXiv:2410.16795</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16795">pdf</a>, <a href="https://arxiv.org/format/2410.16795">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Traj-Explainer: An Explainable and Robust Multi-modal Trajectory Prediction Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haipeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yiqun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Meixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Pu%2C+Z">Ziyuan Pu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16795v1-abstract-short" style="display: inline;"> Navigating complex traffic environments has been significantly enhanced by advancements in intelligent technologies, enabling accurate environment perception and trajectory prediction for automated vehicles. However, existing research often neglects the joint reasoning of scenario agents and lacks interpretability in trajectory prediction models, thereby limiting their practic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16795v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16795v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16795v1-abstract-full" style="display: none;"> Navigating complex traffic environments has been significantly enhanced by advancements in intelligent technologies, enabling accurate environment perception and trajectory prediction for automated vehicles.
However, existing research often neglects the joint reasoning of scenario agents and lacks interpretability in trajectory prediction models, thereby limiting their practical application in real-world scenarios. To this end, an explainability-oriented trajectory prediction model, Explainable Conditional Diffusion based Multimodal Trajectory Prediction (Traj-Explainer), is designed in this work to retrieve the factors that influence a prediction and help explain its intrinsic mechanism. In Traj-Explainer, a modified conditional diffusion model is designed to capture the multimodal trajectory patterns of a scenario, and a modified Shapley value model is assembled to learn the importance of the global and scenario features. Numerical experiments are carried out on several trajectory prediction datasets, including the Waymo, NGSIM, HighD, and MoCAD datasets. Furthermore, we evaluate the identified input factors and find that they agree with human driving experience, indicating that the proposed model learns the prediction appropriately. Code is available in our open-source repository: \url{https://anonymous.4open.science/r/Interpretable-Prediction}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16795v1-abstract-full').style.display = 'none'; document.getElementById('2410.16795v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16644">arXiv:2410.16644</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16644">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CKSP: Cross-species Knowledge Sharing and Preserving for Universal Animal Activity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mao%2C+A">Axiu Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Meilu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhaojin Guo</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zheng He</a>, <a href="/search/cs?searchtype=author&amp;query=Norton%2C+T">Tomas Norton</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Kai Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16644v1-abstract-short" style="display: inline;"> Deep learning techniques are dominating automated animal activity recognition (AAR) tasks with wearable sensors due to their high performance on large-scale labelled data. However, current deep learning-based AAR models are trained solely on datasets of individual animal species, constraining their applicability in practice and performing poorly when training data are limited.
In this study, we pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16644v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16644v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16644v1-abstract-full" style="display: none;"> Deep learning techniques are dominating automated animal activity recognition (AAR) tasks with wearable sensors due to their high performance on large-scale labelled data. However, current deep learning-based AAR models are trained solely on datasets of individual animal species, constraining their applicability in practice and performing poorly when training data are limited. In this study, we propose a one-for-many framework, dubbed Cross-species Knowledge Sharing and Preserving (CKSP), based on sensor data of diverse animal species. Given the coexistence of generic and species-specific behavioural patterns among different species, we design a Shared-Preserved Convolution (SPConv) module. This module assigns an individual low-rank convolutional layer to each species for extracting species-specific features and employs a shared full-rank convolutional layer to learn generic features, enabling the CKSP framework to learn inter-species complementarity and alleviating data limitations via increasing data diversity. Considering the training conflict arising from discrepancies in data distributions among species, we devise a Species-specific Batch Normalization (SBN) module, which involves multiple BN layers to separately fit the distributions of different species. To validate CKSP&#39;s effectiveness, experiments are performed on three public datasets of horses, sheep, and cattle. The results show that our approach remarkably boosts the classification performance compared to the baseline method (one-for-one framework) solely trained on individual-species data, with increments of 6.04%, 2.06%, and 3.66% in accuracy, and 10.33%, 3.67%, and 7.90% in F1-score for the horse, sheep, and cattle datasets, respectively. This demonstrates the promise of our method in leveraging multi-species data to improve classification performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16644v1-abstract-full').style.display = 'none'; document.getElementById('2410.16644v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
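<!-- Editor's note: a hedged PyTorch sketch of the shared/species-specific
convolution idea in the CKSP abstract (arXiv:2410.16644) above: one shared
full-rank Conv1d plus one low-rank branch per species. All sizes and the
exact branch composition are my assumptions, not the paper's released code.

import torch
import torch.nn as nn

class SPConv(nn.Module):
    def __init__(self, n_species, c_in, c_out, k=5, rank=4):
        super().__init__()
        self.shared = nn.Conv1d(c_in, c_out, k, padding=k // 2)  # generic features
        # One low-rank branch per species: c_in to rank to c_out.
        self.specific = nn.ModuleList([
            nn.Sequential(nn.Conv1d(c_in, rank, k, padding=k // 2),
                          nn.Conv1d(rank, c_out, 1))
            for _ in range(n_species)])

    def forward(self, x, species_id):
        # Shared generic features plus a species-specific low-rank residual.
        return self.shared(x) + self.specific[species_id](x)

layer = SPConv(n_species=3, c_in=6, c_out=16)
out = layer(torch.randn(2, 6, 100), species_id=1)  # e.g. the sheep branch
print(out.shape)  # torch.Size([2, 16, 100])
-->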
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15612">arXiv:2410.15612</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15612">pdf</a>, <a href="https://arxiv.org/format/2410.15612">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> In-Trajectory Inverse Reinforcement Learning: Learn Incrementally Before An Ongoing Trajectory Terminates </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shicheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minghui Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15612v2-abstract-short" style="display: inline;"> Inverse reinforcement learning (IRL) aims to learn a reward function and a corresponding policy that best fit the demonstrated trajectories of an expert. However, current IRL works cannot learn incrementally from an ongoing trajectory because they have to wait to collect at least one complete trajectory to learn. To bridge the gap, this paper considers the problem of learning a reward function and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15612v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15612v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15612v2-abstract-full" style="display: none;"> Inverse reinforcement learning (IRL) aims to learn a reward function and a corresponding policy that best fit the demonstrated trajectories of an expert. However, current IRL works cannot learn incrementally from an ongoing trajectory because they have to wait to collect at least one complete trajectory to learn. To bridge the gap, this paper considers the problem of learning a reward function and a corresponding policy while observing the initial state-action pair of an ongoing trajectory and keeping updating the learned reward and policy when new state-action pairs of the ongoing trajectory are observed. We formulate this problem as an online bi-level optimization problem where the upper level dynamically adjusts the learned reward according to the newly observed state-action pairs with the help of a meta-regularization term, and the lower level learns the corresponding policy. We propose a novel algorithm to solve this problem and guarantee that the algorithm achieves sub-linear local regret $O(\sqrt{T}+\log T+\sqrt{T}\log T)$. If the reward function is linear, we prove that the proposed algorithm achieves sub-linear regret $O(\log T)$. Experiments are used to validate the proposed algorithm. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15612v2-abstract-full').style.display = 'none'; document.getElementById('2410.15612v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15319">arXiv:2410.15319</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15319">pdf</a>, <a href="https://arxiv.org/format/2410.15319">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Causality for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+A">Anpeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Kuang%2C+K">Kun Kuang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minqin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yingrong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yujia Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+K">Kairong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Baohong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15319v1-abstract-short" style="display: inline;"> Recent breakthroughs in artificial intelligence have driven a paradigm shift, where large language models (LLMs) with billions or trillions of parameters are trained on vast datasets, achieving unprecedented success across a series of language tasks. However, despite these successes, LLMs still rely on probabilistic modeling, which often captures spurious correlations rooted in linguistic patterns&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15319v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15319v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15319v1-abstract-full" style="display: none;"> Recent breakthroughs in artificial intelligence have driven a paradigm shift, where large language models (LLMs) with billions or trillions of parameters are trained on vast datasets, achieving unprecedented success across a series of language tasks. 
However, despite these successes, LLMs still rely on probabilistic modeling, which often captures spurious correlations rooted in linguistic patterns and social stereotypes, rather than the true causal relationships between entities and events. This limitation renders LLMs vulnerable to issues such as demographic biases, social stereotypes, and LLM hallucinations. These challenges highlight the urgent need to integrate causality into LLMs, moving beyond correlation-driven paradigms to build more reliable and ethically aligned AI systems. While many existing surveys and studies focus on utilizing prompt engineering to activate LLMs for causal knowledge or developing benchmarks to assess their causal reasoning abilities, most of these efforts rely on human intervention to activate pre-trained models. How to embed causality into the training process of LLMs and build more general and intelligent models remains unexplored. Recent research highlights that LLMs function as causal parrots, capable of reciting causal knowledge without truly understanding or applying it. These prompt-based methods remain limited to improvements that require human intervention. This survey aims to address this gap by exploring how causality can enhance LLMs at every stage of their lifecycle: from token embedding learning and foundation model training to fine-tuning, alignment, inference, and evaluation, paving the way for more interpretable, reliable, and causally informed models. Additionally, we further outline six promising future directions to advance LLM development, enhance their causal reasoning capabilities, and address the current limitations these models face. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15319v1-abstract-full').style.display = 'none'; document.getElementById('2410.15319v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13586">arXiv:2410.13586</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13586">pdf</a>, <a href="https://arxiv.org/format/2410.13586">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Preference Aligned Diffusion Planner for Quadrupedal Locomotion Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xinyi Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+Z">Zhiwei Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zifan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenkai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+Z">Zhao Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Z">Zhenchao Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Meixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+C">Chenjia Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13586v1-abstract-short" style="display: inline;"> Diffusion models demonstrate superior performance in capturing complex distributions from large-scale datasets, providing a promising solution for quadrupedal locomotion control. However, offline policy is sensitive to Out-of-Distribution (OOD) states due to the limited state coverage in the datasets. In this work, we propose a two-stage learning framework combining offline learning and online pre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13586v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13586v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13586v1-abstract-full" style="display: none;"> Diffusion models demonstrate superior performance in capturing complex distributions from large-scale datasets, providing a promising solution for quadrupedal locomotion control. However, offline policy is sensitive to Out-of-Distribution (OOD) states due to the limited state coverage in the datasets. In this work, we propose a two-stage learning framework combining offline learning and online preference alignment for legged locomotion control. Through the offline stage, the diffusion planner learns the joint distribution of state-action sequences from expert datasets without using reward labels. Subsequently, we perform the online interaction in the simulation environment based on the trained offline planer, which significantly addresses the OOD issues and improves the robustness. Specifically, we propose a novel weak preference labeling method without the ground-truth reward or human preferences. The proposed method exhibits superior stability and velocity tracking accuracy in pacing, trotting, and bounding gait under both slow- and high-speed scenarios and can perform zero-shot transfer to the real Unitree Go1 robots. The project website for this paper is at https://shangjaven.github.io/preference-aligned-diffusion-legged/. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13586v1-abstract-full').style.display = 'none'; document.getElementById('2410.13586v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12926">arXiv:2410.12926</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12926">pdf</a>, <a href="https://arxiv.org/format/2410.12926">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DEeR: Deviation Eliminating and Noise Regulating for Privacy-preserving Federated Low-rank Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Meilu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+A">Axiu Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yixuan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12926v1-abstract-short" style="display: inline;"> Integrating low-rank adaptation (LoRA) with federated learning (FL) has received widespread attention recently, aiming to adapt pretrained foundation models (FMs) to downstream medical tasks via privacy-preserving decentralized training. However, owing to the direct combination of LoRA and FL, current methods generally undergo two problems, i.e., aggregation deviation, and differential privacy (DP&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12926v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12926v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12926v1-abstract-full" style="display: none;"> Integrating low-rank adaptation (LoRA) with federated learning (FL) has received widespread attention recently, aiming to adapt pretrained foundation models (FMs) to downstream medical tasks via privacy-preserving decentralized training. However, owing to the direct combination of LoRA and FL, current methods generally undergo two problems, i.e., aggregation deviation, and differential privacy (DP) noise amplification effect. To address these problems, we propose a novel privacy-preserving federated finetuning framework called \underline{D}eviation \underline{E}liminating and Nois\underline{e} \underline{R}egulating (DEeR). Specifically, we firstly theoretically prove that the necessary condition to eliminate aggregation deviation is guaranteing the equivalence between LoRA parameters of clients. Based on the theoretical insight, a deviation eliminator is designed to utilize alternating minimization algorithm to iteratively optimize the zero-initialized and non-zero-initialized parameter matrices of LoRA, ensuring that aggregation deviation always be zeros during training. 
Furthermore, we also conduct an in-depth analysis of the noise amplification effect and find that this problem is mainly caused by the &quot;linear relationship&quot; between DP noise and LoRA parameters. To suppress the noise amplification effect, we propose a noise regulator that exploits two regulator factors to decouple the relationship between DP and LoRA, thereby achieving robust privacy protection and excellent finetuning performance. Additionally, we perform comprehensive ablation experiments to verify the effectiveness of the deviation eliminator and noise regulator. DEeR shows better performance on public medical datasets in comparison with state-of-the-art approaches. The code is available at https://github.com/CUHK-AIM-Group/DEeR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12926v1-abstract-full').style.display = 'none'; document.getElementById('2410.12926v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11783">arXiv:2410.11783</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11783">pdf</a>, <a href="https://arxiv.org/format/2410.11783">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Latent BKI: Open-Dictionary Continuous Mapping in Visual-Language Latent Spaces with Quantifiable Uncertainty </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wilson%2C+J">Joey Wilson</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Ruihan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yile Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ewen%2C+P">Parker Ewen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minghan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Barton%2C+K">Kira Barton</a>, <a href="/search/cs?searchtype=author&amp;query=Ghaffari%2C+M">Maani Ghaffari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11783v1-abstract-short" style="display: inline;"> This paper introduces a novel probabilistic mapping algorithm, Latent BKI, which enables open-vocabulary mapping with quantifiable uncertainty. Traditionally, semantic mapping algorithms focus on a fixed set of semantic categories, which limits their applicability for complex robotic tasks.
Vision-Language (VL) models have recently emerged as a technique to jointly model language and visual feature&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11783v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11783v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11783v1-abstract-full" style="display: none;"> This paper introduces a novel probabilistic mapping algorithm, Latent BKI, which enables open-vocabulary mapping with quantifiable uncertainty. Traditionally, semantic mapping algorithms focus on a fixed set of semantic categories, which limits their applicability for complex robotic tasks. Vision-Language (VL) models have recently emerged as a technique to jointly model language and visual features in a latent space, enabling semantic recognition beyond a predefined, fixed set of semantic classes. Latent BKI recurrently incorporates neural embeddings from VL models into a voxel map with quantifiable uncertainty, leveraging the spatial correlations of nearby observations through Bayesian Kernel Inference (BKI). Latent BKI is evaluated against similar explicit semantic mapping and VL mapping frameworks on the popular MatterPort-3D and Semantic KITTI datasets, demonstrating that Latent BKI maintains the probabilistic benefits of continuous mapping with the additional benefit of open-dictionary queries. Real-world experiments demonstrate applicability to challenging indoor environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11783v1-abstract-full').style.display = 'none'; document.getElementById('2410.11783v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10832">arXiv:2410.10832</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10832">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Non-Interrupting Rail Track Geometry Measurement System Using UAV and LiDAR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Lihao Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Ming Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">JeeWoong Park</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yingtao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Teng%2C+H">Hualiang Teng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10832v2-abstract-short" style="display: inline;"> The safety of train operations is largely dependent on the health of rail tracks, necessitating regular and meticulous inspection and maintenance.
A significant part of such inspections involves geometric measurements of the tracks to detect any potential problems. Traditional methods for track geometry measurements, while proven to be accurate, require track closures during inspections and consu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10832v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10832v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10832v2-abstract-full" style="display: none;"> The safety of train operations is largely dependent on the health of rail tracks, necessitating regular and meticulous inspection and maintenance. A significant part of such inspections involves geometric measurements of the tracks to detect any potential problems. Traditional methods for track geometry measurements, while proven to be accurate, require track closures during inspections and consume a considerable amount of time as the inspection area grows, causing significant disruptions to regular operations. To address this challenge, this paper proposes a track geometry measurement system (TGMS) that utilizes an unmanned aerial vehicle (UAV) platform equipped with a light detection and ranging (LiDAR) sensor. Integrated with a state-of-the-art machine-learning-based computer vision algorithm and a simultaneous localization and mapping (SLAM) algorithm, this platform can conduct rail geometry inspections seamlessly over a larger area without interrupting rail operations. In particular, this semi- or fully automated system is found capable of measuring critical rail geometry irregularities in gauge, curvature, and profile with sub-inch accuracy. Cross-level and warp are not measured due to the absence of gravity data. By eliminating operational interruptions, our system offers a more streamlined, cost-effective, and safer solution for inspecting and maintaining rail infrastructure. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10832v2-abstract-full').style.display = 'none'; document.getElementById('2410.10832v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10620">arXiv:2410.10620</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10620">pdf</a>, <a href="https://arxiv.org/ps/2410.10620">ps</a>, <a href="https://arxiv.org/format/2410.10620">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> On the sparsity of binary numbers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Meijun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10620v1-abstract-short" style="display: inline;"> We introduce the concept of negative coefficients in various number-based systems, with a focus on decimal and binary systems. We demonstrate that every binary number can be transformed into a sparse form, significantly enhancing computational speed by converting binary numbers into this form. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10620v1-abstract-full" style="display: none;"> We introduce the concept of negative coefficients in various number-based systems, with a focus on decimal and binary systems. We demonstrate that every binary number can be transformed into a sparse form, significantly enhancing computational speed by converting binary numbers into this form. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10620v1-abstract-full').style.display = 'none'; document.getElementById('2410.10620v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages. 
While it is not directly within my original research focus on PDEs, it is connected to areas of Math education and computational efficiency</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 90C09; 90C10; 97H20 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10589">arXiv:2410.10589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10589">pdf</a>, <a href="https://arxiv.org/format/2410.10589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoTE: Reconciling Generalization with Specialization for Visual-Language to Video Knowledge Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minghao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhengpu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+M">Mengxian Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Dang%2C+R">Ronghao Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xiao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chengju Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qijun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10589v1-abstract-short" style="display: inline;"> Transferring visual-language knowledge from large-scale foundation models for video recognition has proved to be effective. To bridge the domain gap, additional parametric modules are added to capture the temporal information. However, zero-shot generalization diminishes with the increase in the number of specialized parameters, making existing works a trade-off between zero-shot and close-set per&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10589v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10589v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10589v1-abstract-full" style="display: none;"> Transferring visual-language knowledge from large-scale foundation models for video recognition has proved to be effective. To bridge the domain gap, additional parametric modules are added to capture the temporal information. However, zero-shot generalization diminishes with the increase in the number of specialized parameters, making existing works a trade-off between zero-shot and close-set performance. In this paper, we present MoTE, a novel framework that enables generalization and specialization to be balanced in one unified model. Our approach tunes a mixture of temporal experts to learn multiple task views with various degrees of data fitting. To maximally preserve the knowledge of each expert, we propose \emph{Weight Merging Regularization}, which regularizes the merging process of experts in weight space. Additionally, we apply temporal feature modulation to regularize the contribution of temporal features at test time.
We achieve a sound balance between zero-shot and close-set video recognition tasks and obtain state-of-the-art or competitive results on various datasets, including Kinetics-400 \&amp; 600, UCF, and HMDB. Code is available at \url{https://github.com/ZMHH-H/MoTE}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10589v1-abstract-full').style.display = 'none'; document.getElementById('2410.10589v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Camera Ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10343">arXiv:2410.10343</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10343">pdf</a>, <a href="https://arxiv.org/format/2410.10343">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Locking Down the Finetuned LLMs Safety </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minjun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Linyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yifan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+N">Ningyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10343v1-abstract-short" style="display: inline;"> Fine-tuning large language models (LLMs) on additional datasets is often necessary to optimize them for specific downstream tasks. However, existing safety alignment measures, which restrict harmful behavior during inference, are insufficient to mitigate safety risks during fine-tuning. Alarmingly, fine-tuning with just 10 toxic sentences can make models comply with harmful instructions. We introd&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10343v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10343v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10343v1-abstract-full" style="display: none;"> Fine-tuning large language models (LLMs) on additional datasets is often necessary to optimize them for specific downstream tasks. However, existing safety alignment measures, which restrict harmful behavior during inference, are insufficient to mitigate safety risks during fine-tuning. Alarmingly, fine-tuning with just 10 toxic sentences can make models comply with harmful instructions. We introduce SafetyLock, a novel alignment intervention method that maintains robust safety post-fine-tuning through efficient and transferable mechanisms. 
SafetyLock leverages our discovery that fine-tuned models retain similar safety-related activation representations to their base models. This insight enables us to extract what we term the Meta-SafetyLock, a set of safety bias directions representing key activation patterns associated with safe responses in the original model. We can then apply these directions universally to fine-tuned models to enhance their safety. By searching for activation directions across multiple token dimensions, SafetyLock achieves enhanced robustness and transferability. SafetyLock re-aligns fine-tuned models in under 0.01 seconds without additional computational cost. Our experiments demonstrate that SafetyLock can reduce the harmful instruction response rate from 60% to below 1% in toxic fine-tuned models. It surpasses traditional methods in both performance and efficiency, offering a scalable, non-invasive solution for ensuring the safety of customized LLMs. Our analysis across various fine-tuning scenarios confirms SafetyLock&#39;s robustness, advocating its integration into safety protocols for aligned LLMs. The code is released at https://github.com/zhu-minjun/SafetyLock. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10343v1-abstract-full').style.display = 'none'; document.getElementById('2410.10343v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09728">arXiv:2410.09728</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09728">pdf</a>, <a href="https://arxiv.org/format/2410.09728">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Meta-Reinforcement Learning with Universal Policy Adaptation: Provable Near-Optimality under All-task Optimum Comparator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Siyuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minghui Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09728v1-abstract-short" style="display: inline;"> Meta-reinforcement learning (Meta-RL) has attracted attention due to its capability to enhance reinforcement learning (RL) algorithms, in terms of data efficiency and generalizability. 
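<!-- Editor's note: a hedged toy of the general activation-steering idea behind
the SafetyLock abstract (arXiv:2410.10343) above: estimate a safety direction
as the mean activation difference between safe and harmful prompts, then add
it to hidden states at inference. The data, dimensions, and the scalar alpha
are my illustrative assumptions, not the SafetyLock implementation.

import numpy as np

rng = np.random.default_rng(2)
h_safe = rng.normal(0.0, 1.0, size=(100, 64))  # activations on safe prompts
h_harm = rng.normal(0.5, 1.0, size=(100, 64))  # activations on harmful prompts

direction = h_safe.mean(axis=0) - h_harm.mean(axis=0)
direction /= np.linalg.norm(direction)         # unit "safety bias direction"

def steer(hidden, alpha=4.0):
    # Shift a residual-stream activation toward the safe region.
    return hidden + alpha * direction

print(steer(rng.normal(size=64)).shape)  # (64,)
-->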
In this paper, we develop a bilevel optimization framework for meta-RL (BO-MRL) to learn the meta-prior for task-specific policy adaptation, which implements multiple-step policy optimization on one-time data collec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09728v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09728v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09728v1-abstract-full" style="display: none;"> Meta-reinforcement learning (Meta-RL) has attracted attention due to its capability to enhance reinforcement learning (RL) algorithms, in terms of data efficiency and generalizability. In this paper, we develop a bilevel optimization framework for meta-RL (BO-MRL) to learn the meta-prior for task-specific policy adaptation, which implements multiple-step policy optimization on one-time data collection. Beyond existing meta-RL analyses, we provide upper bounds of the expected optimality gap over the task distribution. This metric measures the distance of the policy adaptation from the learned meta-prior to the task-specific optimum, and quantifies the model&#39;s generalizability to the task distribution. We empirically validate the correctness of the derived upper bounds and demonstrate the superior effectiveness of the proposed algorithm over benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09728v1-abstract-full').style.display = 'none'; document.getElementById('2410.09728v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08877">arXiv:2410.08877</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08877">pdf</a>, <a href="https://arxiv.org/format/2410.08877">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Interdependency Matters: Graph Alignment for Multivariate Time Series Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuanyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Haifeng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chengsen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mengde Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+W">Wei Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Q">Qi Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+Z">Zirui Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+J">Jianxin Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08877v1-abstract-short" style="display: inline;"> Anomaly detection in multivariate time series (MTS) is crucial for various applications in data mining and industry. Current industrial methods typically approach anomaly detection as an unsupervised learning task, aiming to identify deviations by estimating the normal distribution in noisy, label-free datasets. These methods increasingly incorporate interdependencies between channels through grap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08877v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08877v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08877v1-abstract-full" style="display: none;"> Anomaly detection in multivariate time series (MTS) is crucial for various applications in data mining and industry. Current industrial methods typically approach anomaly detection as an unsupervised learning task, aiming to identify deviations by estimating the normal distribution in noisy, label-free datasets. These methods increasingly incorporate interdependencies between channels through graph structures to enhance accuracy. However, the role of interdependencies is more critical than previously understood, as shifts in interdependencies between MTS channels from normal to anomalous data are significant. This observation suggests that \textit{anomalies could be detected by changes in these interdependency graph series}. 
To capitalize on this insight, we introduce MADGA (MTS Anomaly Detection via Graph Alignment), which redefines anomaly detection as a graph alignment (GA) problem that explicitly utilizes interdependencies. MADGA dynamically transforms subsequences into graphs to capture the evolving interdependencies, and graph alignment is performed between these graphs by optimizing an alignment plan that minimizes cost, effectively minimizing the distance for normal data and maximizing it for anomalous data. Uniquely, our GA approach involves explicit alignment of both nodes and edges, employing Wasserstein distance for nodes and Gromov-Wasserstein distance for edges. To our knowledge, this is the first application of GA to MTS anomaly detection that explicitly leverages interdependency for this purpose. Extensive experiments on diverse real-world datasets validate the effectiveness of MADGA, demonstrating its capability to detect anomalies and differentiate interdependencies, consistently achieving state-of-the-art performance across various scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08877v1-abstract-full').style.display = 'none'; document.getElementById('2410.08877v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07758">arXiv:2410.07758</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07758">pdf</a>, <a href="https://arxiv.org/format/2410.07758">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HeightFormer: A Semantic Alignment Monocular 3D Object Detection Method from Roadside Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zihao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haipeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+N">Nanfang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Meixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Pu%2C+Z">Ziyuan Pu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07758v2-abstract-short" style="display: inline;"> The on-board 3D object detection technology has received extensive attention as a critical technology for autonomous driving, while few studies have focused on applying roadside sensors in 3D traffic object detection. Existing studies achieve the projection of 2D image features to 3D features through height estimation based on the frustum.
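<!-- Editor's note: a hedged approximation (mine, using the POT optimal
transport library, not the MADGA code from arXiv:2410.08877 above) of
scoring a pair of windowed graphs by a Wasserstein term over node features
plus a Gromov-Wasserstein term over edge structure.

import numpy as np
import ot  # Python Optimal Transport, assumed installed

rng = np.random.default_rng(3)
X1, X2 = rng.normal(size=(6, 4)), rng.normal(size=(6, 4))    # node features
C1 = np.abs(rng.normal(size=(6, 6))); C1 = (C1 + C1.T) / 2   # edge weights, graph 1
C2 = np.abs(rng.normal(size=(6, 6))); C2 = (C2 + C2.T) / 2   # edge weights, graph 2

p, q = ot.unif(6), ot.unif(6)                                # uniform node masses
node_cost = ot.emd2(p, q, ot.dist(X1, X2))                   # Wasserstein (nodes)
edge_cost = ot.gromov.gromov_wasserstein2(C1, C2, p, q,
                                          loss_fun='square_loss')  # GW (edges)
score = node_cost + edge_cost   # larger score suggests a more anomalous window
print(score)
-->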
However, they did not consider the height alignment and th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07758v2-abstract-full').style.display = 'inline'; document.getElementById('2410.07758v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07758v2-abstract-full" style="display: none;"> On-board 3D object detection has received extensive attention as a critical technology for autonomous driving, yet few studies have focused on applying roadside sensors to 3D traffic object detection. Existing studies achieve the projection of 2D image features to 3D features through height estimation based on the frustum. However, they did not consider the height alignment and the extraction efficiency of bird&#39;s-eye-view features. We propose a novel 3D object detection framework integrating Spatial Former and Voxel Pooling Former to enhance 2D-to-3D projection based on height estimation. Extensive experiments were conducted using the Rope3D and DAIR-V2X-I datasets, and the results demonstrated that the proposed algorithm outperforms existing methods in detecting both vehicles and cyclists. These results indicate that the algorithm is robust and generalizes well across various detection scenarios. Improving the accuracy of 3D object detection on the roadside is conducive to building a safe and trustworthy intelligent transportation system with vehicle-road coordination and to promoting the large-scale application of autonomous driving. The code and pre-trained models will be released on https://anonymous.4open.science/r/HeightFormer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07758v2-abstract-full').style.display = 'none'; document.getElementById('2410.07758v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06158">arXiv:2410.06158</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06158">pdf</a>, <a href="https://arxiv.org/format/2410.06158">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheang%2C+C">Chi-Lam Cheang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guangzeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jing%2C+Y">Ya Jing</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+T">Tao Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yifeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuxiao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Hongtao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jiafeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yichu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanbo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minzhao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06158v1-abstract-short" style="display: inline;"> We present GR-2, a state-of-the-art generalist robot agent for versatile and generalizable robot manipulation. GR-2 is first pre-trained on a vast number of Internet videos to capture the dynamics of the world. This large-scale pre-training, involving 38 million video clips and over 50 billion tokens, equips GR-2 with the ability to generalize across a wide range of robotic tasks and environments&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06158v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06158v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06158v1-abstract-full" style="display: none;"> We present GR-2, a state-of-the-art generalist robot agent for versatile and generalizable robot manipulation. GR-2 is first pre-trained on a vast number of Internet videos to capture the dynamics of the world. This large-scale pre-training, involving 38 million video clips and over 50 billion tokens, equips GR-2 with the ability to generalize across a wide range of robotic tasks and environments during subsequent policy learning. Following this, GR-2 is fine-tuned for both video generation and action prediction using robot trajectories. It exhibits impressive multi-task learning capabilities, achieving an average success rate of 97.7% across more than 100 tasks. 
Moreover, GR-2 demonstrates exceptional generalization to new, previously unseen scenarios, including novel backgrounds, environments, objects, and tasks. Notably, GR-2 scales effectively with model size, underscoring its potential for continued growth and application. Project page: https://gr2-manipulation.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06158v1-abstract-full').style.display = 'none'; document.getElementById('2410.06158v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech Report. Authors are listed in alphabetical order. Project page: https://gr2-manipulation.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04842">arXiv:2410.04842</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04842">pdf</a>, <a href="https://arxiv.org/format/2410.04842">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Simple Image Segmentation Framework via In-Context Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jing%2C+C">Chenchen Jing</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hengtao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Muzhi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinlong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Chunhua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04842v2-abstract-short" style="display: inline;"> Recently, there have been explorations of generalist segmentation models that can effectively tackle a variety of image segmentation tasks within a unified in-context learning framework. However, these methods still struggle with task ambiguity in in-context segmentation, as not all in-context examples can accurately convey the task information. In order to address this issue, we present SINE, a s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04842v2-abstract-full').style.display = 'inline'; document.getElementById('2410.04842v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04842v2-abstract-full" style="display: none;"> Recently, there have been explorations of generalist segmentation models that can effectively tackle a variety of image segmentation tasks within a unified in-context learning framework. However, these methods still struggle with task ambiguity in in-context segmentation, as not all in-context examples can accurately convey the task information.
In order to address this issue, we present SINE, a simple image Segmentation framework utilizing in-context examples. Our approach leverages a Transformer encoder-decoder structure, where the encoder provides high-quality image representations, and the decoder is designed to yield multiple task-specific output masks to effectively eliminate task ambiguity. Specifically, we introduce an In-context Interaction module to complement in-context information and produce correlations between the target image and the in-context example, and a Matching Transformer that uses fixed matching and a Hungarian algorithm to eliminate differences between tasks. In addition, we have further refined the current evaluation system for in-context image segmentation, aiming to facilitate a holistic appraisal of these models. Experiments on various segmentation tasks show the effectiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04842v2-abstract-full').style.display = 'none'; document.getElementById('2410.04842v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Proc. Conference on Neural Information Processing Systems (NeurIPS) 2024. Webpage: https://github.com/aim-uofa/SINE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02847">arXiv:2410.02847</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02847">pdf</a>, <a href="https://arxiv.org/format/2410.02847">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Signature: Characterization of Large-Scale Molecular Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qin%2C+T">Tiexin Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mengxu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lyons%2C+T">Terry Lyons</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hong Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoliang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02847v1-abstract-short" style="display: inline;"> Understanding protein dynamics is essential for deciphering protein functional mechanisms and developing molecular therapies. However, the complex high-dimensional dynamics and interatomic interactions of biological processes pose significant challenges for existing computational techniques.
In this paper, we approach this problem for the first time by introducing Deep Signature, a novel computati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02847v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02847v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02847v1-abstract-full" style="display: none;"> Understanding protein dynamics is essential for deciphering protein functional mechanisms and developing molecular therapies. However, the complex high-dimensional dynamics and interatomic interactions of biological processes pose significant challenges for existing computational techniques. In this paper, we approach this problem for the first time by introducing Deep Signature, a novel computationally tractable framework that characterizes complex dynamics and interatomic interactions based on their evolving trajectories. Specifically, our approach incorporates soft spectral clustering that locally aggregates cooperative dynamics to reduce the size of the system, as well as a signature transform that collects iterated integrals to provide a global characterization of the non-smooth interactive dynamics. Theoretical analysis demonstrates that Deep Signature exhibits several desirable properties, including invariance to translation, near invariance to rotation, equivariance to permutation of atomic coordinates, and invariance under time reparameterization. Furthermore, experimental results on three benchmarks of biological processes verify that our approach can achieve superior performance compared to baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02847v1-abstract-full').style.display = 'none'; document.getElementById('2410.02847v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
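(Since the entry leans on the signature transform, here is what a depth-2 signature looks like when approximated from discrete samples; this is generic textbook signature code, not the authors' pipeline. Level 1 collects net increments, level 2 the iterated integrals S^{ij}.)
<pre><code class="language-python">
import numpy as np

def signature_depth2(path):
    """Depth-2 signature of a sampled path of shape (T, d)."""
    dx = np.diff(path, axis=0)                  # (T-1, d) increments
    level1 = dx.sum(axis=0)                     # S^i: total increment
    # channel value accumulated *before* each step, paired with each step
    cum = np.vstack([np.zeros(path.shape[1]), np.cumsum(dx, axis=0)[:-1]])
    level2 = cum.T @ dx                         # S^{ij}: iterated integrals
    return np.concatenate([level1, level2.ravel()])

t = np.linspace(0.0, 1.0, 200)
loop = np.stack([np.cos(2 * np.pi * t), np.sin(2 * np.pi * t)], axis=1)
sig = signature_depth2(loop)
levy_area = 0.5 * (sig[3] - sig[4])   # 0.5*(S^{12} - S^{21}), ~pi for this loop
print(sig.round(3), round(float(levy_area), 3))
</code></pre>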
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 page, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02369">arXiv:2410.02369</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02369">pdf</a>, <a href="https://arxiv.org/format/2410.02369">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Potential of the Diffusion Model in Few-shot Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Muzhi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Z">Zekai Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Jing%2C+C">Chenchen Jing</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+G">Guangkai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinlong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Chunhua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02369v3-abstract-short" style="display: inline;"> The Diffusion Model has not only garnered noteworthy achievements in the realm of image generation but has also demonstrated its potential as an effective pretraining method utilizing unlabeled data. Drawing from the extensive potential unveiled by the Diffusion Model in both semantic correspondence and open vocabulary segmentation, our work initiates an investigation into employing the Latent Dif&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02369v3-abstract-full').style.display = 'inline'; document.getElementById('2410.02369v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02369v3-abstract-full" style="display: none;"> The Diffusion Model has not only garnered noteworthy achievements in the realm of image generation but has also demonstrated its potential as an effective pretraining method utilizing unlabeled data. Drawing from the extensive potential unveiled by the Diffusion Model in both semantic correspondence and open vocabulary segmentation, our work initiates an investigation into employing the Latent Diffusion Model for Few-shot Semantic Segmentation. Recently, inspired by the in-context learning ability of large language models, Few-shot Semantic Segmentation has evolved into In-context Segmentation tasks, morphing into a crucial element in assessing generalist segmentation models. In this context, we concentrate on Few-shot Semantic Segmentation, establishing a solid foundation for the future development of a Diffusion-based generalist model for segmentation. Our initial focus lies in understanding how to facilitate interaction between the query image and the support image, resulting in the proposal of a KV fusion method within the self-attention framework. 
Subsequently, we delve deeper into optimizing the infusion of information from the support mask and simultaneously re-evaluating how to provide reasonable supervision from the query mask. Based on our analysis, we establish a simple and effective framework named DiffewS, maximally retaining the original Latent Diffusion Model&#39;s generative framework and effectively utilizing the pre-training prior. Experimental results demonstrate that our method significantly outperforms the previous SOTA models in multiple settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02369v3-abstract-full').style.display = 'none'; document.getElementById('2410.02369v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Proc. Annual Conference on Neural Information Processing Systems (NeurIPS) 2024. Webpage: https://github.com/aim-uofa/DiffewS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00508">arXiv:2410.00508</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.00508">pdf</a>, <a href="https://arxiv.org/format/2410.00508">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FlipGuard: Defending Preference Alignment against Update Regression with Constrained Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mingye Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Quan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Junbo Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Z">Zhendong Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00508v2-abstract-short" style="display: inline;"> Recent breakthroughs in preference alignment have significantly improved Large Language Models&#39; ability to generate texts that align with human preferences and values. However, current alignment metrics typically emphasize the post-hoc overall improvement, while overlooking a critical aspect: regression, which refers to the backsliding on previously correctly-handled data after updates. 
This poten&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00508v2-abstract-full').style.display = 'inline'; document.getElementById('2410.00508v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00508v2-abstract-full" style="display: none;"> Recent breakthroughs in preference alignment have significantly improved Large Language Models&#39; ability to generate texts that align with human preferences and values. However, current alignment metrics typically emphasize the post-hoc overall improvement, while overlooking a critical aspect: regression, which refers to the backsliding on previously correctly-handled data after updates. This potential pitfall may arise from excessive fine-tuning on already well-aligned data, which subsequently leads to over-alignment and degeneration. To address this challenge, we propose FlipGuard, a constrained optimization approach to detect and mitigate update regression with focal attention. Specifically, FlipGuard identifies performance degradation using a customized reward characterization and strategically enforces a constraint to encourage conditional congruence with the pre-aligned model during training. Comprehensive experiments demonstrate that FlipGuard effectively alleviates update regression while maintaining excellent overall performance, with the added benefit of preserving knowledge during preference alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00508v2-abstract-full').style.display = 'none'; document.getElementById('2410.00508v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
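(To make the regression idea concrete: a flip is an example whose reward drops below the pre-aligned model's after the update, and a FlipGuard-style objective pulls the policy back toward the pre-aligned model only on those examples. The reward and penalty forms below are assumptions for illustration, not the paper's formulation.)
<pre><code class="language-python">
import torch

def flipguard_style_loss(task_loss, reward_new, reward_old,
                         logp_new, logp_old, beta=1.0):
    """Add a focal constraint only where the updated model backslides."""
    regressed = (reward_new - reward_old).detach().clamp(max=0.0).abs()
    # pull toward the pre-aligned policy, weighted by how badly each
    # example regressed (weight is zero when there is no regression)
    congruence = regressed * (logp_new - logp_old).abs()
    return task_loss + beta * congruence.mean()

reward_old = torch.tensor([0.9, 0.2, 0.8])
reward_new = torch.tensor([0.95, 0.4, 0.3])    # only example 3 regressed
logp_old = torch.tensor([-1.0, -2.0, -1.2])
logp_new = torch.tensor([-0.9, -1.5, -3.0])
print(flipguard_style_loss(torch.tensor(0.5), reward_new, reward_old,
                           logp_new, logp_old))
</code></pre>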
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Main track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19589">arXiv:2409.19589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19589">pdf</a>, <a href="https://arxiv.org/format/2409.19589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Effective Diffusion Transformer Architecture for Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+K">Kun Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lei Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+Z">Zhijun Tu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xiao He</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mingrui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nannan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xinbo Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jie Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19589v1-abstract-short" style="display: inline;"> Recent advances indicate that diffusion models hold great promise in image super-resolution. While the latest methods are primarily based on latent diffusion models with convolutional neural networks, there are few attempts to explore transformers, which have demonstrated remarkable performance in image generation. In this work, we design an effective diffusion transformer for image super-resoluti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19589v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19589v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19589v1-abstract-full" style="display: none;"> Recent advances indicate that diffusion models hold great promise in image super-resolution. While the latest methods are primarily based on latent diffusion models with convolutional neural networks, there are few attempts to explore transformers, which have demonstrated remarkable performance in image generation. In this work, we design an effective diffusion transformer for image super-resolution (DiT-SR) that achieves the visual quality of prior-based methods, but through a training-from-scratch manner. In practice, DiT-SR leverages an overall U-shaped architecture, and adopts a uniform isotropic design for all the transformer blocks across different stages. The former facilitates multi-scale hierarchical feature extraction, while the latter reallocates the computational resources to critical layers to further enhance performance. 
Moreover, we thoroughly analyze the limitations of the widely used AdaLN, and present a frequency-adaptive time-step conditioning module, enhancing the model&#39;s capacity to process distinct frequency information at different time steps. Extensive experiments demonstrate that DiT-SR outperforms the existing training-from-scratch diffusion-based SR methods significantly, and even beats some of the prior-based methods on pretrained Stable Diffusion, proving the superiority of the diffusion transformer in image super-resolution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19589v1-abstract-full').style.display = 'none'; document.getElementById('2409.19589v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/kunncheng/DiT-SR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16876">arXiv:2409.16876</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16876">pdf</a>, <a href="https://arxiv.org/format/2409.16876">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Automating Traffic Model Enhancement with AI Research Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+X">Xusen Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinxi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+M">Mingxing Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+H">Hongliang Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Meixin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hai Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16876v2-abstract-short" style="display: inline;"> Developing efficient traffic models is essential for optimizing transportation systems, yet current approaches remain time-intensive and susceptible to human errors due to their reliance on manual processes. Traditional workflows involve exhaustive literature reviews, formula optimization, and iterative testing, leading to inefficiencies in research. In response, we introduce the Traffic Research&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16876v2-abstract-full').style.display = 'inline'; document.getElementById('2409.16876v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16876v2-abstract-full" style="display: none;"> Developing efficient traffic models is essential for optimizing transportation systems, yet current approaches remain time-intensive and susceptible to human errors due to their reliance on manual processes.
Traditional workflows involve exhaustive literature reviews, formula optimization, and iterative testing, leading to inefficiencies in research. In response, we introduce the Traffic Research Agent (TR-Agent), an AI-driven system designed to autonomously develop and refine traffic models through an iterative, closed-loop process. Specifically, we divide the research pipeline into four key stages: idea generation, theory formulation, theory evaluation, and iterative optimization; and construct TR-Agent with four corresponding modules: Idea Generator, Code Generator, Evaluator, and Analyzer. Working in synergy, these modules retrieve knowledge from external resources, generate novel ideas, implement and debug models, and finally assess them on the evaluation datasets. Furthermore, the system continuously refines these models based on iterative feedback, enhancing research efficiency and model performance. Experimental results demonstrate that TR-Agent achieves significant performance improvements across multiple traffic models, including the Intelligent Driver Model (IDM) for car following, the MOBIL lane-changing model, and the Lighthill-Whitham-Richards (LWR) traffic flow model. Additionally, TR-Agent provides detailed explanations for its optimizations, allowing researchers to verify and build upon its improvements easily. This flexibility makes the framework a powerful tool for researchers in transportation and beyond. To further support research and collaboration, we have open-sourced both the code and data used in our experiments, facilitating broader access and enabling continued advancements in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16876v2-abstract-full').style.display = 'none'; document.getElementById('2409.16876v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">52 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16572">arXiv:2409.16572</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16572">pdf</a>, <a href="https://arxiv.org/format/2409.16572">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> Efficient and generalizable nested Fourier-DeepONet for three-dimensional geological carbon sequestration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J+E">Jonathan E. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Min Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xi%2C+Z">Ziqiao Xi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y+O">Yanhua O. 
Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+L">Lu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16572v1-abstract-short" style="display: inline;"> Geological carbon sequestration (GCS) involves injecting CO$_2$ into subsurface geological formations for permanent storage. Numerical simulations could guide decisions in GCS projects by predicting CO$_2$ migration pathways and the pressure distribution in storage formation. However, these simulations are often computationally expensive due to highly coupled physics and large spatial-temporal sim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16572v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16572v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16572v1-abstract-full" style="display: none;"> Geological carbon sequestration (GCS) involves injecting CO$_2$ into subsurface geological formations for permanent storage. Numerical simulations could guide decisions in GCS projects by predicting CO$_2$ migration pathways and the pressure distribution in storage formation. However, these simulations are often computationally expensive due to highly coupled physics and large spatial-temporal simulation domains. Surrogate modeling with data-driven machine learning has become a promising alternative to accelerate physics-based simulations. Among these, the Fourier neural operator (FNO) has been applied to three-dimensional synthetic subsurface models. Here, to further improve performance, we have developed a nested Fourier-DeepONet by combining the expressiveness of the FNO with the modularity of a deep operator network (DeepONet). This new framework is twice as efficient as a nested FNO for training and has at least 80% lower GPU memory requirement due to its flexibility to treat temporal coordinates separately. These performance improvements are achieved without compromising prediction accuracy. In addition, the generalization and extrapolation ability of nested Fourier-DeepONet beyond the training range has been thoroughly evaluated. Nested Fourier-DeepONet outperformed the nested FNO for extrapolation in time with more than 50% reduced error. It also exhibited good extrapolation accuracy beyond the training range in terms of reservoir properties, number of wells, and injection rate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16572v1-abstract-full').style.display = 'none'; document.getElementById('2409.16572v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
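(A schematic of the DeepONet half of that combination, since the entry hinges on it: a branch net encodes the input function from sensor samples, a trunk net encodes the query coordinates, and their dot product gives the prediction. Feeding time to the trunk, as below, is what makes temporal coordinates separable; all sizes are made up.)
<pre><code class="language-python">
import torch
import torch.nn as nn

class TinyDeepONet(nn.Module):
    """u(x, t) is approximated by branch(function samples) . trunk(x, t)."""
    def __init__(self, n_sensors=50, width=64, p=32):
        super().__init__()
        self.branch = nn.Sequential(nn.Linear(n_sensors, width), nn.Tanh(),
                                    nn.Linear(width, p))
        self.trunk = nn.Sequential(nn.Linear(2, width), nn.Tanh(),
                                   nn.Linear(width, p))

    def forward(self, func_samples, coords):
        b = self.branch(func_samples)   # (batch, p)
        t = self.trunk(coords)          # (n_points, p)
        return b @ t.T                  # (batch, n_points)

model = TinyDeepONet()
func = torch.randn(4, 50)     # e.g. an injection-rate profile at 50 sensors
xt = torch.rand(100, 2)       # (location, time) query pairs
print(model(func, xt).shape)  # torch.Size([4, 100])
</code></pre>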
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16182">arXiv:2409.16182</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16182">pdf</a>, <a href="https://arxiv.org/format/2409.16182">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> TiM4Rec: An Efficient Sequential Recommendation Model Based on Time-Aware Structured State Space Duality Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fan%2C+H">Hao Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mengyi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yanrong Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+H">Hailin Feng</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zhijie He</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongjiu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qingyang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16182v2-abstract-short" style="display: inline;"> Sequential recommendation represents a pivotal branch of recommendation systems, centered around dynamically analyzing the sequential dependencies between user preferences and their interactive behaviors. Despite the Transformer architecture-based models achieving commendable performance within this domain, their quadratic computational complexity relative to the sequence dimension impedes efficie&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16182v2-abstract-full').style.display = 'inline'; document.getElementById('2409.16182v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16182v2-abstract-full" style="display: none;"> Sequential recommendation represents a pivotal branch of recommendation systems, centered around dynamically analyzing the sequential dependencies between user preferences and their interactive behaviors. Despite the Transformer architecture-based models achieving commendable performance within this domain, their quadratic computational complexity relative to the sequence dimension impedes efficient modeling. In response, the innovative Mamba architecture, characterized by linear computational complexity, has emerged. Mamba4Rec further pioneers the application of Mamba in sequential recommendation. Nonetheless, Mamba 1&#39;s hardware-aware algorithm struggles to efficiently leverage modern matrix computational units, which led to the proposal of the improved State Space Duality (SSD), also known as Mamba 2. While SSD4Rec successfully adapts the SSD architecture for sequential recommendation, showing promising results in high-dimensional contexts, it suffers significant performance drops in low-dimensional scenarios crucial for pure ID sequential recommendation tasks. Addressing this challenge, we propose a novel sequential recommendation backbone model, TiM4Rec, which ameliorates the low-dimensional performance loss of the SSD architecture while preserving its computational efficiency.
Drawing inspiration from TiSASRec, we develop a time-aware enhancement method tailored for the linear computation demands of the SSD architecture, thereby enhancing its adaptability and achieving state-of-the-art (SOTA) performance in both low- and high-dimensional modeling. The code for our model is publicly accessible at https://github.com/AlwaysFHao/TiM4Rec. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16182v2-abstract-full').style.display = 'none'; document.getElementById('2409.16182v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16120">arXiv:2409.16120</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16120">pdf</a>, <a href="https://arxiv.org/format/2409.16120">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MOSS: Enabling Code-Driven Evolution and Context Management for AI Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Ming Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16120v1-abstract-short" style="display: inline;"> Developing AI agents powered by large language models (LLMs) faces significant challenges in achieving true Turing completeness and adaptive, code-driven evolution. Current approaches often generate code independently of its runtime context, relying heavily on the LLM&#39;s memory, which results in inefficiencies and limits adaptability. Manual protocol development in sandbox environments further cons&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16120v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16120v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16120v1-abstract-full" style="display: none;"> Developing AI agents powered by large language models (LLMs) faces significant challenges in achieving true Turing completeness and adaptive, code-driven evolution. Current approaches often generate code independently of its runtime context, relying heavily on the LLM&#39;s memory, which results in inefficiencies and limits adaptability. Manual protocol development in sandbox environments further constrains the agent&#39;s autonomous adaptability. Crucially, achieving consistency in code and context across multi-turn interactions and ensuring isolation of local variables within each interaction remain unsolved problems.
We introduce MOSS (llM-oriented Operating System Simulation), a novel framework that addresses these challenges by integrating code generation with a dynamic context management system. MOSS ensures consistency and adaptability by using a mechanism that maintains the Python context across interactions, including isolation of local variables and preservation of runtime integrity. At its core, the framework employs an Inversion of Control (IoC) container in conjunction with decorators to enforce the least knowledge principle, allowing agents to focus on abstract interfaces rather than concrete implementations. This facilitates seamless integration of new tools and libraries, enables runtime instance replacement, and reduces prompt complexity, providing a &#34;what you see is what you get&#34; environment for the agent. Through a series of case studies, we show how this framework can enhance the efficiency and capabilities of agent development and highlight its advantages in moving towards Turing-complete agents capable of evolving through code. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16120v1-abstract-full').style.display = 'none'; document.getElementById('2409.16120v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14411">arXiv:2409.14411</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.14411">pdf</a>, <a href="https://arxiv.org/format/2409.14411">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Scaling Diffusion Policy in Transformer to 1 Billion Parameters for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minjie Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yichen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+J">Junjie Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiyuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Ning Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+R">Ran Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Chaomin Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yaxin Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Feifei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14411v2-abstract-short" style="display: inline;"> Diffusion Policy is a powerful tool for learning end-to-end visuomotor robot control. It is expected that Diffusion Policy possesses scalability, a key attribute for deep neural networks, typically suggesting that increasing model size would lead to enhanced performance.
However, our observations indicate that Diffusion Policy in transformer architecture (DP-T) struggles to scale effectiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14411v2-abstract-full').style.display = 'inline'; document.getElementById('2409.14411v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14411v2-abstract-full" style="display: none;"> Diffusion Policy is a powerful tool for learning end-to-end visuomotor robot control. It is expected that Diffusion Policy possesses scalability, a key attribute for deep neural networks, typically suggesting that increasing model size would lead to enhanced performance. However, our observations indicate that Diffusion Policy in transformer architecture (DP-T) struggles to scale effectively; even minor additions of layers can deteriorate training outcomes. To address this issue, we introduce Scalable Diffusion Transformer Policy for visuomotor learning. Our proposed method, namely ScaleDP, introduces two modules that improve the training dynamics of Diffusion Policy and allow the network to better handle multimodal action distributions. First, we identify that DP-T suffers from large gradient issues, making the optimization of Diffusion Policy unstable. To resolve this issue, we factorize the feature embedding of the observation into multiple affine layers, and integrate it into the transformer blocks. Additionally, we utilize non-causal attention, which allows the policy network to &#34;see&#34; future actions during prediction, helping to reduce compounding errors. We demonstrate that our proposed method successfully scales the Diffusion Policy from 10 million to 1 billion parameters. This new model, named ScaleDP, can effectively scale up the model size with improved performance and generalization. We benchmark ScaleDP across 50 different tasks from MetaWorld and find that our largest ScaleDP outperforms DP-T with an average improvement of 21.6%. Across 7 real-world robot tasks, our ScaleDP demonstrates an average improvement of 36.25% over DP-T on four single-arm tasks and 75% on three bimanual tasks. We believe our work paves the way for scaling up models for visuomotor learning. The project page is available at scaling-diffusion-policy.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14411v2-abstract-full').style.display = 'none'; document.getElementById('2409.14411v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
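(The "factorize the feature embedding of the observation into multiple affine layers" step reads like per-block feature-wise modulation; a guess at its shape, with invented names and sizes:)
<pre><code class="language-python">
import torch
import torch.nn as nn

class AffineConditionedBlock(nn.Module):
    """Transformer-style block where the observation enters through a small
    per-block scale/shift head instead of one global embedding."""
    def __init__(self, dim, obs_dim):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(),
                                 nn.Linear(4 * dim, dim))
        self.to_scale_shift = nn.Linear(obs_dim, 2 * dim)

    def forward(self, x, obs):
        scale, shift = self.to_scale_shift(obs).chunk(2, dim=-1)
        h = self.norm(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
        return x + self.mlp(h)

block = AffineConditionedBlock(dim=64, obs_dim=128)
x = torch.randn(2, 16, 64)    # 16 action tokens per sample
obs = torch.randn(2, 128)     # fused camera/state embedding (assumed)
print(block(x, obs).shape)    # torch.Size([2, 16, 64])
</code></pre>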
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13716">arXiv:2409.13716</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13716">pdf</a>, <a href="https://arxiv.org/format/2409.13716">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Constrained Multi-Layer Contrastive Learning for Implicit Discourse Relationship Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yiheng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junhui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Muhua Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13716v1-abstract-short" style="display: inline;"> Previous approaches to the task of implicit discourse relation recognition (IDRR) generally view it as a classification task. Even with pre-trained language models, like BERT and RoBERTa, IDRR still relies on complicated neural networks with multiple intermediate layers to properly capture the interaction between two discourse units. As a result, the outputs of these intermediate layers may have dif&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13716v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13716v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13716v1-abstract-full" style="display: none;"> Previous approaches to the task of implicit discourse relation recognition (IDRR) generally view it as a classification task. Even with pre-trained language models, like BERT and RoBERTa, IDRR still relies on complicated neural networks with multiple intermediate layers to properly capture the interaction between two discourse units. As a result, the outputs of these intermediate layers may have different capabilities in discriminating instances of different classes. To this end, we propose to adapt a supervised contrastive learning (CL) method, label- and instance-centered CL, to enhance representation learning. Moreover, we propose a novel constrained multi-layer CL approach to properly impose a constraint that the contrastive loss of higher layers should be smaller than that of lower layers. Experimental results on PDTB 2.0 and PDTB 3.0 show that our approach can significantly improve the performance on both multi-class classification and binary classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13716v1-abstract-full').style.display = 'none'; document.getElementById('2409.13716v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
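(The layer-ordering constraint translates directly into code: sum the per-layer contrastive losses and penalize any higher layer whose loss exceeds the layer below it. The hinge form and weight are assumptions, not the paper's exact formulation.)
<pre><code class="language-python">
import torch

def constrained_multilayer_cl(layer_losses, lam=0.1):
    """layer_losses: per-layer contrastive losses ordered low to high.
    Adds a hinge penalty wherever a higher layer's loss is larger."""
    losses = torch.stack(layer_losses)
    violations = (losses[1:] - losses[:-1]).clamp(min=0.0)
    return losses.sum() + lam * violations.sum()

# toy values: the rise from 0.9 to 1.1 violates the constraint
layer_losses = [torch.tensor(1.2), torch.tensor(0.9), torch.tensor(1.1)]
print(constrained_multilayer_cl(layer_losses))  # 3.2 + 0.1 * 0.2
</code></pre>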
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13056">arXiv:2409.13056</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13056">pdf</a>, <a href="https://arxiv.org/format/2409.13056">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross-Chirality Palmprint Verification: Left is Right for the Right Palmprint </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chengrui Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Ziyuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Ng%2C+T">Tiong-Sik Ng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Min Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Teoh%2C+A+B+J">Andrew Beng Jin Teoh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13056v1-abstract-short" style="display: inline;"> Palmprint recognition has emerged as a prominent biometric authentication method, owing to its high discriminative power and user-friendly nature. This paper introduces a novel Cross-Chirality Palmprint Verification (CCPV) framework that challenges the conventional wisdom in traditional palmprint verification systems. Unlike existing methods that typically require storing both left and right palmp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13056v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13056v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13056v1-abstract-full" style="display: none;"> Palmprint recognition has emerged as a prominent biometric authentication method, owing to its high discriminative power and user-friendly nature. This paper introduces a novel Cross-Chirality Palmprint Verification (CCPV) framework that challenges the conventional wisdom in traditional palmprint verification systems. Unlike existing methods that typically require storing both left and right palmprints, our approach enables verification using either palm while storing only one palmprint template. The core of our CCPV framework lies in a carefully designed matching rule. This rule involves flipping both the gallery and query palmprints and calculating the average distance between each pair as the final matching distance. This approach effectively reduces matching variance and enhances overall system robustness. We introduce a novel cross-chirality loss function to construct a discriminative and robust cross-chirality feature space. This loss enforces representation consistency across four palmprint variants: left, right, flipped left, and flipped right. The resulting compact feature space, coupled with the model&#39;s enhanced discriminative representation capability, ensures robust performance across various scenarios. We conducted extensive experiments to validate the efficacy of our proposed method. The evaluation encompassed multiple public datasets and considered both closed-set and open-set settings. 
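(The matching rule is simple enough to state in code under one plausible reading: embed the stored palmprint and the probe both as-is and mirrored, and average the pair distances, so a single stored template serves either palm. The embedding and distance below are placeholders.)
<pre><code class="language-python">
import numpy as np

def ccpv_distance(gallery, query, embed):
    """Average the original-pair and flipped-pair distances."""
    pairs = [(gallery, query), (np.fliplr(gallery), np.fliplr(query))]
    return float(np.mean([np.linalg.norm(embed(a) - embed(b))
                          for a, b in pairs]))

embed = lambda img: img.mean(axis=0)   # placeholder feature extractor
rng = np.random.default_rng(1)
left, right = rng.random((64, 64)), rng.random((64, 64))
print(ccpv_distance(left, right, embed))
</code></pre>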
The results demonstrate the CCPV framework&#39;s effectiveness and highlight its potential for real-world applications in palmprint authentication systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13056v1-abstract-full').style.display = 'none'; document.getElementById('2409.13056v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12514">arXiv:2409.12514</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12514">pdf</a>, <a href="https://arxiv.org/format/2409.12514">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wen%2C+J">Junjie Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yichen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minjie Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+K">Kun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiyuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Ning Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+R">Ran Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Chaomin Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yaxin Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Feifei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12514v4-abstract-short" style="display: inline;"> Vision-Language-Action (VLA) models have shown remarkable potential in visuomotor control and instruction comprehension through end-to-end learning processes. However, current VLA models face significant challenges: they are slow during inference and require extensive pre-training on large amounts of robotic data, making real-world deployment difficult. In this paper, we introduce a new family of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12514v4-abstract-full').style.display = 'inline'; document.getElementById('2409.12514v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12514v4-abstract-full" style="display: none;"> Vision-Language-Action (VLA) models have shown remarkable potential in visuomotor control and instruction comprehension through end-to-end learning processes. 
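Editor's note: the matching rule described in the abstract is easy to state in code. A toy sketch follows, with embed() standing in for any palmprint feature extractor (the paper trains one with its cross-chirality loss; nothing here reproduces that training).

```python
# Toy sketch of the cross-chirality matching rule: flip both gallery and
# query, embed all variants, and average the four pairwise distances.
import numpy as np

def embed(img):
    """Placeholder feature extractor for a 2-D grayscale palmprint image."""
    v = img.reshape(-1)
    return v / (np.linalg.norm(v) + 1e-8)

def cross_chirality_distance(gallery_img, query_img):
    dists = []
    for g in (gallery_img, np.fliplr(gallery_img)):
        for q in (query_img, np.fliplr(query_img)):
            dists.append(np.linalg.norm(embed(g) - embed(q)))
    return float(np.mean(dists))     # averaging reduces matching variance

g, q = np.random.rand(128, 128), np.random.rand(128, 128)
print(cross_chirality_distance(g, q))
```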
arXiv:2409.12514 (https://arxiv.org/abs/2409.12514) [pdf, other] cs.RO cs.CV
TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for Robotic Manipulation
Authors: Junjie Wen, Yichen Zhu, Jinming Li, Minjie Zhu, Kun Wu, Zhiyuan Xu, Ning Liu, Ran Cheng, Chaomin Shen, Yaxin Peng, Feifei Feng, Jian Tang
Abstract: Vision-Language-Action (VLA) models have shown remarkable potential in visuomotor control and instruction comprehension through end-to-end learning processes. However, current VLA models face significant challenges: they are slow at inference and require extensive pre-training on large amounts of robotic data, making real-world deployment difficult. In this paper, we introduce TinyVLA, a new family of compact vision-language-action models that offers two key advantages over existing VLA models: (1) faster inference and (2) improved data efficiency, eliminating the need for a pre-training stage. Our framework incorporates two essential components to build TinyVLA: (1) initializing the policy backbone with robust, high-speed multimodal models, and (2) integrating a diffusion policy decoder during fine-tuning to enable precise robot actions. We conducted extensive evaluations of TinyVLA in both simulation and on real robots, demonstrating that our approach significantly outperforms the state-of-the-art VLA model, OpenVLA, in terms of speed and data efficiency, while delivering comparable or superior performance. Additionally, TinyVLA exhibits strong generalization across various dimensions, including language instructions, novel objects, unseen positions, changes in object appearance, background variations, and environmental shifts, often matching or exceeding the performance of OpenVLA. We believe that TinyVLA offers an interesting perspective on utilizing pre-trained multimodal models for policy learning. Our project is at https://tiny-vla.github.io.
Submitted 14 November, 2024; v1 submitted 19 September, 2024; originally announced September 2024.
Comments: add more citations

arXiv:2409.12412 (https://arxiv.org/abs/2409.12412) [pdf] cs.LG cs.CV
How to predict on-road air pollution based on street view images and machine learning: a quantitative analysis of the optimal strategy
Authors: Hui Zhong, Di Chen, Pengqin Wang, Wenrui Wang, Shaojie Shen, Yonghong Liu, Meixin Zhu
Abstract: On-road air pollution exhibits substantial variability over short distances due to emission sources, dilution, and physicochemical processes. Integrating mobile monitoring data with street view images (SVIs) holds promise for predicting local air pollution. However, algorithms, sampling strategies, and image quality introduce extra errors, and reliable references quantifying their effects have been lacking. To bridge this gap, we employed 314 taxis to dynamically monitor NO, NO2, PM2.5, and PM10 and sampled corresponding SVIs, aiming to develop a reliable strategy. We extracted SVI features from approximately 382,000 streetscape images, collected at various angles (0°, 90°, 180°, 270°) and ranges (buffers with radii of 100 m, 200 m, 300 m, 400 m, and 500 m). Three machine learning algorithms, alongside the linear land-use regression (LUR) model, were tested to explore the influence of different algorithms. Four typical image-quality issues were identified and discussed. Generally, machine learning methods outperform linear LUR for estimating the four pollutants, with the ranking: random forest > XGBoost > neural network > LUR. Compared to single-angle sampling, the averaging strategy effectively avoids the bias of insufficient feature capture. The optimal sampling strategy is therefore to obtain SVIs within a 100 m radius buffer and to extract features using the averaging strategy; this approach achieved estimates at each aggregation location with absolute errors almost always below 2.5 μg/m^3 or ppb. Overexposure, blur, and underexposure led to image misjudgments and incorrect identifications, causing overestimation of road features and underestimation of human-activity features, and thereby contributing to inaccurate NO, NO2, PM2.5, and PM10 estimates.
Submitted 18 September, 2024; originally announced September 2024.
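Editor's note: to make the reported optimal strategy concrete, here is a hedged Python sketch of the multi-angle averaging step feeding a random forest, the best-ranked algorithm above. The extractor, data, and shapes are stand-ins, not the study's pipeline.

```python
# Illustrative sketch: average one feature vector per view angle at each
# sampling point (the "averaging strategy"), then fit a random forest.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

ANGLES = (0, 90, 180, 270)

def point_features(views_by_angle, extract):
    """views_by_angle: dict angle -> image array for one 100 m buffer point."""
    return np.mean([extract(views_by_angle[a]) for a in ANGLES], axis=0)

rng = np.random.default_rng(0)
extract = lambda img: img.mean(axis=(0, 1))        # stand-in feature model
pts = [{a: rng.random((32, 32, 3)) for a in ANGLES} for _ in range(200)]
X = np.stack([point_features(p, extract) for p in pts])
y = rng.random(200)                                # stand-in NO2 readings
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)
```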
arXiv:2409.11694 (https://arxiv.org/abs/2409.11694) [pdf, other] cs.RO
From Words to Wheels: Automated Style-Customized Policy Generation for Autonomous Driving
Authors: Xu Han, Xianda Chen, Zhenghan Cai, Pinlong Cai, Meixin Zhu, Xiaowen Chu
Abstract: Autonomous driving technology has witnessed rapid advancements, with foundation models improving interactivity and user experience. However, current autonomous vehicles (AVs) face significant limitations in delivering command-based driving styles. Most existing methods either rely on predefined driving styles that require expert input or use data-driven techniques such as Inverse Reinforcement Learning to extract styles from driving data. These approaches, though effective in some cases, face challenges: difficulty obtaining specific driving data for style matching (e.g., in robotaxis), inability to align driving-style metrics with user preferences, and limitation to pre-existing styles, restricting customization and generalization to new commands. This paper introduces Words2Wheels, a framework that automatically generates customized driving policies from natural-language user commands. Words2Wheels employs a Style-Customized Reward Function to generate a Style-Customized Driving Policy without relying on prior driving data. By leveraging large language models and a Driving Style Database, the framework efficiently retrieves, adapts, and generalizes driving styles, and a Statistical Evaluation module ensures alignment with user preferences. Experimental results demonstrate that Words2Wheels outperforms existing methods in accuracy, generalization, and adaptability, offering a novel solution for customized AV driving behavior. Code and demo available at https://yokhon.github.io/Words2Wheels/.
Submitted 18 September, 2024; originally announced September 2024.
Comments: 6 pages, 7 figures

arXiv:2409.09790 (https://arxiv.org/abs/2409.09790) [pdf, other] cs.CV cs.AI cs.RO
Multiple Rotation Averaging with Constrained Reweighting Deep Matrix Factorization
Authors: Shiqi Li, Jihua Zhu, Yifan Xie, Naiwen Hu, Mingchen Zhu, Zhongyu Li, Di Wang
Abstract: Multiple rotation averaging plays a crucial role in computer vision and robotics. Conventional optimization-based methods optimize a nonlinear cost function under certain noise assumptions, while most previous learning-based methods require ground-truth labels for supervised training. Recognizing that handcrafted noise assumptions may not be reasonable in all real-world scenarios, this paper proposes an effective rotation averaging method that mines data patterns in a learning manner while avoiding the requirement of labels. Specifically, we apply deep matrix factorization to solve the multiple rotation averaging problem directly in unconstrained linear space. For the deep matrix factorization, we design a neural network model that is explicitly low-rank and symmetric, to better suit the structure of multiple rotation averaging. Meanwhile, we utilize spanning-tree-based edge filtering to suppress the influence of rotation outliers. Moreover, we adopt a reweighting scheme and a dynamic depth-selection strategy to further improve robustness. Our method combines the merits of optimization-based and learning-based methods, and experimental results on various datasets validate its effectiveness.
Submitted 15 September, 2024; originally announced September 2024.
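Editor's note: as a rough illustration of the low-rank, symmetric factorization idea only (the paper's reweighting, edge filtering, and dynamic depth selection are all omitted), one can fit a stacked-rotation factor R directly, assuming the observed blocks satisfy G_ij ≈ R_i R_j^T. This is our simplification, not the authors' network.

```python
# Illustrative sketch: fit an explicitly low-rank, symmetric factorization
# R R^T to the observed blocks of relative rotations, then project each
# 3x3 block of R back onto SO(3).
import torch

def rotation_averaging_dmf(G_obs, mask, n, iters=2000, lr=1e-2):
    """G_obs: (3n, 3n) observed blocks G_ij ~= R_i R_j^T; mask: 1 where observed."""
    R = torch.randn(3 * n, 3, requires_grad=True)   # rank-3 symmetric factor
    opt = torch.optim.Adam([R], lr=lr)
    for _ in range(iters):
        opt.zero_grad()
        loss = (mask * (R @ R.T - G_obs)).pow(2).sum()
        loss.backward()
        opt.step()
    rots = []
    for i in range(n):                              # SVD projection onto SO(3)
        u, _, vh = torch.linalg.svd(R[3 * i: 3 * i + 3].detach())
        d = torch.sign(torch.linalg.det(u @ vh)).item()
        rots.append(u @ torch.diag(torch.tensor([1.0, 1.0, d])) @ vh)
    return rots
```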
arXiv:2409.09780 (https://arxiv.org/abs/2409.09780) [pdf, ps, other] cs.IT
doi: 10.1109/LCOMM.2024.3462828 (https://doi.org/10.1109/LCOMM.2024.3462828)
Power Allocation for Finite-Blocklength IR-HARQ
Authors: Wenyu Wang, Minhao Zhu, Kaiming Shen, Zhaorui Wang, Shuguang Cui
Abstract: This letter concerns power allocation across the multiple transmission rounds of the Incremental Redundancy Hybrid Automatic Repeat reQuest (IR-HARQ) policy, in pursuit of an energy-efficient way of fulfilling the outage-probability target in the finite-blocklength regime. We start by showing that the optimization objective and the constraints of this power allocation problem all depend on the outage probability. The main challenge lies in the fact that the outage probability cannot be written analytically in terms of the power variables. To sidestep this difficulty, we propose a novel upper bound on the outage probability in the finite-blocklength regime that is much tighter than existing bounds from the literature. Most importantly, by using this upper bound to approximate the outage probability, we can recast the original intractable power allocation problem into a geometric programming (GP) form, which can be solved efficiently by standard methods.
Submitted 15 September, 2024; originally announced September 2024.
Journal ref: IEEE Communications Letters 2024
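Editor's note: the GP recasting is the letter's main computational point. Below is a minimal cvxpy sketch of the GP mechanics only, with a made-up monomial standing in for the paper's outage upper bound (the real bound is derived in the letter and is not reproduced here).

```python
# Illustrative GP sketch: minimize a proxy for expected energy subject to a
# hypothetical monomial outage bound; solve(gp=True) handles the log-log
# convex reformulation.
import cvxpy as cp

p1 = cp.Variable(pos=True)                      # per-round transmit powers
p2 = cp.Variable(pos=True)
outage_ub = 0.5 * p1**-1.2 * p2**-0.8           # stand-in posynomial bound
prob = cp.Problem(cp.Minimize(p1 + 0.6 * p2),   # expected-energy proxy
                  [outage_ub <= 1e-3])          # outage target
prob.solve(gp=True)
print(p1.value, p2.value)
```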
arXiv:2409.08010 (https://arxiv.org/abs/2409.08010) [pdf, other] cs.LG
Multiplex Graph Contrastive Learning with Soft Negatives
Authors: Zhenhao Zhao, Minhong Zhu, Chen Wang, Sijia Wang, Jiqiang Zhang, Li Chen, Weiran Cai
Abstract: Graph Contrastive Learning (GCL) seeks to learn node or graph representations that contain maximal consistent information from graph-structured data. While node-level contrasting modes dominate, some efforts have begun to explore consistency across different scales. Yet these tend to lose consistent information and to be contaminated by disturbing features. Here, we introduce MUX-GCL, a novel cross-scale contrastive learning paradigm that utilizes multiplex representations as effective patches. While this learning mode minimizes contaminating noise, a commensurate contrasting strategy using positional affinities further avoids information loss by correcting false negative pairs across scales. Extensive downstream experiments demonstrate that MUX-GCL yields multiple state-of-the-art results on public datasets. Our theoretical analysis further guarantees that the new objective function is a stricter lower bound on the mutual information between raw input features and output embeddings, which rationalizes the paradigm. Code is available at https://github.com/MUX-GCL/Code.
Submitted 12 September, 2024; originally announced September 2024.

arXiv:2409.07902 (https://arxiv.org/abs/2409.07902) [pdf, other] eess.SP cs.IT cs.LG
Conformal Distributed Remote Inference in Sensor Networks Under Reliability and Communication Constraints
Authors: Meiyi Zhu, Matteo Zecchin, Sangwoo Park, Caili Guo, Chunyan Feng, Petar Popovski, Osvaldo Simeone
Abstract: This paper presents the communication-constrained distributed conformal risk control (CD-CRC) framework, a novel decision-making framework for sensor networks under communication constraints. Targeting multi-label classification problems, such as segmentation, CD-CRC dynamically adjusts the local and global thresholds used to identify significant labels, with the goal of ensuring a target false negative rate (FNR) while adhering to communication capacity limits. CD-CRC builds on online exponentiated gradient descent to estimate the relative quality of the observations of different sensors, and on online conformal risk control (CRC) as the mechanism controlling the local and global thresholds. CD-CRC is proved to offer deterministic worst-case performance guarantees in terms of FNR and communication overhead, while its regret in terms of false positive rate (FPR) is characterized as a function of the key hyperparameters. Simulation results highlight the effectiveness of CD-CRC, particularly in communication-resource-constrained environments, making it a valuable tool for enhancing the performance and reliability of distributed sensor networks.
Submitted 12 September, 2024; originally announced September 2024.
Comments: 14 pages, 15 figures
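Editor's note: the online CRC ingredient can be illustrated in a few lines. This toy update is our simplification, not CD-CRC itself: it nudges a single decision threshold so that the running false-negative rate tracks a target alpha.

```python
# Toy online conformal-risk-control update: move the threshold in proportion
# to the gap between the target FNR and the realized FNR of the last round.
def crc_threshold_update(lam, fnr_t, alpha=0.1, eta=0.05):
    # Labels are kept when their score >= lam, so lowering lam keeps more
    # labels and lowers the false-negative rate.
    return lam + eta * (alpha - fnr_t)

lam = 0.5
for fnr_t in [0.20, 0.15, 0.05, 0.00, 0.12]:   # toy observed per-round FNRs
    lam = crc_threshold_update(lam, fnr_t)
print(lam)
```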
class="abstract-short has-text-grey-dark mathjax" id="2409.03215v1-abstract-short" style="display: inline;"> Autonomous agents powered by large language models (LLMs) have attracted significant research interest. However, the open-source community faces many challenges in developing specialized models for agent tasks, driven by the scarcity of high-quality agent datasets and the absence of standard protocols in this area. We introduce and publicly release xLAM, a series of large action models designed fo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03215v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03215v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03215v1-abstract-full" style="display: none;"> Autonomous agents powered by large language models (LLMs) have attracted significant research interest. However, the open-source community faces many challenges in developing specialized models for agent tasks, driven by the scarcity of high-quality agent datasets and the absence of standard protocols in this area. We introduce and publicly release xLAM, a series of large action models designed for AI agent tasks. The xLAM series includes five models with both dense and mixture-of-expert architectures, ranging from 1B to 8x22B parameters, trained using a scalable, flexible pipeline that unifies, augments, and synthesizes diverse datasets to enhance AI agents&#39; generalizability and performance across varied environments. Our experimental results demonstrate that xLAM consistently delivers exceptional performance across multiple agent ability benchmarks, notably securing the 1st position on the Berkeley Function-Calling Leaderboard, outperforming GPT-4, Claude-3, and many other models in terms of tool use. By releasing the xLAM series, we aim to advance the performance of open-source LLMs for autonomous AI agents, potentially accelerating progress and democratizing access to high-performance models for agent tasks. Models are available at https://huggingface.co/collections/Salesforce/xlam-models-65f00e2a0a63bbcd1c2dade4 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03215v1-abstract-full').style.display = 'none'; document.getElementById('2409.03215v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report for the Salesforce xLAM model series</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16365">arXiv:2408.16365</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.16365">pdf</a>, <a href="https://arxiv.org/ps/2408.16365">ps</a>, <a href="https://arxiv.org/format/2408.16365">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Protograph-Based Batched Network Codes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mingyang Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+M">Ming Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Chunming Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16365v1-abstract-short" style="display: inline;"> Batched network codes (BNCs) are a low-complexity solution for communication through networks with packet loss. Although their belief propagation (BP) performance is proved to approach capacity in the asymptotic regime, there is no evidence indicating that their BP performance is as good as expected in the finite-length regime. In this paper, we propose a protograph-based construction for BNCs, re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16365v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16365v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16365v1-abstract-full" style="display: none;"> Batched network codes (BNCs) are a low-complexity solution for communication through networks with packet loss. Although their belief propagation (BP) performance is proved to approach capacity in the asymptotic regime, there is no evidence indicating that their BP performance is as good as expected in the finite-length regime. In this paper, we propose a protograph-based construction for BNCs, referred to as protograph-based BNCs (P-BNCs), which significantly differs from existing BNCs in three aspects: 1) Unlike traditional constructions where the degree of variable nodes is random, P-BNCs have a highly structured Tanner graph with specified degree distributions for both variable nodes and check nodes. 2) Traditional BNCs use a fixed degree distribution to generate all batches, making their performance highly sensitive to channel conditions, but P-BNCs achieve good performance under varying channel conditions due to their rate-compatible structures. 3) The construction of PBNCs takes into account joint BP decoding with a sparse precode, whereas traditional constructions typically do not consider a precode, or assume the presence of a precode that can recover a certain fraction of erasures. Thanks to these three improvements, P-BNCs not only have higher achievable rates under varying channel conditions, but more importantly, their finite-length BP performance is significantly improved. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16365v1-abstract-full').style.display = 'none'; document.getElementById('2408.16365v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14135">arXiv:2408.14135</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14135">pdf</a>, <a href="https://arxiv.org/format/2408.14135">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Foodfusion: A Novel Approach for Food Image Composition via Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+C">Chaohua Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+S">Si Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xule Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mingrui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nannan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xinbo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14135v2-abstract-short" style="display: inline;"> Food image composition requires the use of existing dish images and background images to synthesize a natural new image, while diffusion models have made significant advancements in image generation, enabling the construction of end-to-end architectures that yield promising results. However, existing diffusion models face challenges in processing and fusing information from multiple images and lac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14135v2-abstract-full').style.display = 'inline'; document.getElementById('2408.14135v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14135v2-abstract-full" style="display: none;"> Food image composition requires the use of existing dish images and background images to synthesize a natural new image, while diffusion models have made significant advancements in image generation, enabling the construction of end-to-end architectures that yield promising results. However, existing diffusion models face challenges in processing and fusing information from multiple images and lack access to high-quality publicly available datasets, which prevents the application of diffusion models in food image composition. In this paper, we introduce a large-scale, high-quality food image composite dataset, FC22k, which comprises 22,000 foreground, background, and ground truth ternary image pairs. 
Additionally, we propose a novel food image composition method, Foodfusion, which leverages the capabilities of the pre-trained diffusion models and incorporates a Fusion Module for processing and integrating foreground and background information. This fused information aligns the foreground features with the background structure by merging the global structural information at the cross-attention layer of the denoising UNet. To further enhance the content and structure of the background, we also integrate a Content-Structure Control Module. Extensive experiments demonstrate the effectiveness and scalability of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14135v2-abstract-full').style.display = 'none'; document.getElementById('2408.14135v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13484">arXiv:2408.13484</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13484">pdf</a>, <a href="https://arxiv.org/format/2408.13484">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> IntOPE: Off-Policy Evaluation in the Presence of Interference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuqi Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Ziyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minqin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Kuang%2C+K">Kun Kuang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13484v1-abstract-short" style="display: inline;"> Off-Policy Evaluation (OPE) is employed to assess the potential impact of a hypothetical policy using logged contextual bandit feedback, which is crucial in areas such as personalized medicine and recommender systems, where online interactions are associated with significant risks and costs. 
Traditionally, OPE methods rely on the Stable Unit Treatment Value Assumption (SUTVA), which assumes that t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13484v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13484v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13484v1-abstract-full" style="display: none;"> Off-Policy Evaluation (OPE) is employed to assess the potential impact of a hypothetical policy using logged contextual bandit feedback, which is crucial in areas such as personalized medicine and recommender systems, where online interactions are associated with significant risks and costs. Traditionally, OPE methods rely on the Stable Unit Treatment Value Assumption (SUTVA), which assumes that the reward for any given individual is unaffected by the actions of others. However, this assumption often fails in real-world scenarios due to the presence of interference, where an individual&#39;s reward is affected not just by their own actions but also by the actions of their peers. This realization reveals significant limitations of existing OPE methods in real-world applications. To address this limitation, we propose IntIPW, an IPW-style estimator that extends the Inverse Probability Weighting (IPW) framework by integrating marginalized importance weights to account for both individual actions and the influence of adjacent entities. Extensive experiments are conducted on both synthetic and real-world data to demonstrate the effectiveness of the proposed IntIPW method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13484v1-abstract-full').style.display = 'none'; document.getElementById('2408.13484v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
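Editor's note: a toy IPW-style estimate with an extra neighbourhood weight conveys the general idea behind IntIPW. The paper learns marginalized importance weights; in this sketch both ratios are simply assumed given, and all names are hypothetical.

```python
# Toy off-policy value estimate: reweight each logged reward by the usual
# own-action importance ratio and by a ratio for the (marginalized)
# neighbourhood action, the extension that relaxes SUTVA.
import numpy as np

def int_ipw_value(rewards, own_w, nbr_w):
    """own_w[i] = pi_target(a_i | x_i) / pi_logging(a_i | x_i);
       nbr_w[i] = analogous ratio for unit i's neighbourhood action."""
    return float(np.mean(rewards * own_w * nbr_w))

rng = np.random.default_rng(0)
r = rng.random(1000)
# With all weights equal to 1 the estimate reduces to the plain mean reward.
print(int_ipw_value(r, own_w=np.ones(1000), nbr_w=np.ones(1000)))
```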
Pages: 1 (current) 2 3 4 5 … Next page: /search/?searchtype=author&query=Zhu%2C+M&start=50
