Search | arXiv e-print repository

Showing 1–50 of 957 results for author: Sun, L

Searching in archive cs; results sorted by announcement date (newest first), 50 per page.
1. arXiv:2411.18484 [pdf, other] cs.LG
   SPTTE: A Spatiotemporal Probabilistic Framework for Travel Time Estimation
   Authors: Chen Xu, Qiang Wang, Lijun Sun
   Abstract: Accurate travel time estimation is essential for navigation and itinerary planning. While existing research employs probabilistic modeling to assess travel time uncertainty and account for correlations between multiple trips, modeling the temporal variability of multi-trip travel time distributions remains a significant challenge. Capturing the evolution of joint distributions requires large, well-organized datasets; however, real-world trip data are often temporally sparse and spatially unevenly distributed. To address this issue, we propose SPTTE, a spatiotemporal probabilistic framework that models the evolving joint distribution of multi-trip travel times by formulating the estimation task as a spatiotemporal stochastic process regression problem with fragmented observations. SPTTE incorporates an RNN-based temporal Gaussian process parameterization to regularize sparse observations and capture temporal dependencies. Additionally, it employs a prior-based heterogeneity smoothing strategy to correct unreliable learning caused by unevenly distributed trips, effectively modeling temporal variability under sparse and uneven data distributions. Evaluations on real-world datasets demonstrate that SPTTE outperforms state-of-the-art deterministic and probabilistic methods by over 10.13%. Ablation studies and visualizations further confirm the effectiveness of the model components.
   Submitted 27 November, 2024; originally announced November 2024.
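The "RNN-based temporal Gaussian process parameterization" mentioned above can be made concrete with a small sketch. The code below is not the authors' model, only one plausible reading of the idea under assumed names and shapes: a GRU emits a per-step mean for each trip plus kernel hyperparameters, and the trips observed at each step are scored jointly under the resulting multivariate Gaussian.

# Hedged sketch (not SPTTE itself): an RNN emits time-varying parameters
# of a Gaussian over trips; an RBF kernel over trip features couples the
# trips observed at each step into a joint distribution.
import torch
import torch.nn as nn

class TemporalGPHead(nn.Module):
    def __init__(self, feat_dim, hidden=64):
        super().__init__()
        self.rnn = nn.GRU(feat_dim, hidden, batch_first=True)
        self.mean = nn.Linear(hidden, 1)    # per-trip mean travel time
        self.scale = nn.Linear(hidden, 2)   # log lengthscale, log noise

    def forward(self, trip_feats):
        # trip_feats: (T, N, feat_dim); N trips tracked over T steps
        h, _ = self.rnn(trip_feats.transpose(0, 1))  # (N, T, hidden)
        h = h.transpose(0, 1)                        # (T, N, hidden)
        mu = self.mean(h).squeeze(-1)                # (T, N)
        log_ls, log_noise = self.scale(h.mean(dim=1)).unbind(-1)
        return mu, log_ls.exp(), log_noise.exp()

def gp_nll(mu, lengthscale, noise, trip_feats, y):
    # Average negative log-likelihood of observed travel times y: (T, N).
    T, N, _ = trip_feats.shape
    nll = 0.0
    for t in range(T):
        d2 = torch.cdist(trip_feats[t], trip_feats[t]).pow(2)
        K = torch.exp(-0.5 * d2 / lengthscale[t] ** 2)   # RBF kernel
        K = K + (noise[t] + 1e-4) * torch.eye(N)         # jittered noise
        dist = torch.distributions.MultivariateNormal(mu[t], covariance_matrix=K)
        nll = nll - dist.log_prob(y[t])
    return nll / T

Fragmented observations would, in the paper's setting, mean masking the trips absent at a given step; the sketch omits that for brevity.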
2. arXiv:2411.17218 [pdf, other] cs.LG cs.AI
   GraphSubDetector: Time Series Subsequence Anomaly Detection via Density-Aware Adaptive Graph Neural Network
   Authors: Weiqi Chen, Zhiqiang Zhou, Qingsong Wen, Liang Sun
   Abstract: Time series subsequence anomaly detection is an important task in a large variety of real-world applications ranging from health monitoring to AIOps, and is challenging for the following reasons: 1) how to effectively learn complex dynamics and dependencies in time series; 2) diverse and complicated anomalous subsequences as well as the inherent variance and noise of normal patterns; 3) how to determine the proper subsequence length for effective detection, which is a required parameter for many existing algorithms. In this paper, we present a novel approach to subsequence anomaly detection, namely GraphSubDetector. First, it adaptively learns the appropriate subsequence length with a length selection mechanism that highlights the characteristics of both normal and anomalous patterns. Second, we propose a density-aware adaptive graph neural network (DAGNN), which generates representations that are more robust to the variance of normal data by message passing between subsequences. The experimental results demonstrate the effectiveness of the proposed algorithm, which achieves superior performance on multiple time series anomaly benchmark datasets compared to state-of-the-art algorithms.
   Submitted 26 November, 2024; originally announced November 2024.
3. arXiv:2411.16728 [pdf, other] cs.LG cs.AI physics.ao-ph
   Maximizing the Impact of Deep Learning on Subseasonal-to-Seasonal Climate Forecasting: The Essential Role of Optimization
   Authors: Yizhen Guo, Tian Zhou, Wanyi Jiang, Bo Wu, Liang Sun, Rong Jin
   Abstract: Weather and climate forecasting is vital for sectors such as agriculture and disaster management. Although numerical weather prediction (NWP) systems have advanced, forecasting at the subseasonal-to-seasonal (S2S) scale, spanning 2 to 6 weeks, remains challenging due to the chaotic and sparse atmospheric signals at this interval. Even state-of-the-art deep learning models struggle to outperform simple climatology models in this domain. This paper identifies that optimization, rather than network structure, could be the root cause of this performance gap, and we develop a novel multi-stage optimization strategy to close the gap. Extensive empirical studies demonstrate that our multi-stage optimization approach significantly improves key skill metrics, PCC and TCC, while utilizing the same backbone structure, surpassing the state-of-the-art NWP systems (ECMWF-S2S) by over 19-91%. Our research contests the recent finding that direct forecasting outperforms rolling forecasting for S2S tasks. Through theoretical analysis, we propose that the underperformance of rolling forecasting may arise from the accumulation of Jacobian matrix products during training, and our multi-stage framework can be viewed as a form of teacher forcing that addresses this issue. Code is available at https://anonymous.4open.science/r/Baguan-S2S-23E7/
   Submitted 23 November, 2024; originally announced November 2024.
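The claim that multi-stage optimization acts as teacher forcing against accumulated Jacobian products suggests a simple picture: fine-tune the same backbone on progressively longer rollouts, occasionally resetting the rollout to ground truth. The sketch below is an assumed training scheme, not the released Baguan-S2S code; the loader, shapes, and teacher-forcing probability p_tf are all hypothetical.

# Assumed multi-stage scheme (not the paper's code): stage k trains on
# k-step rollouts; with probability p_tf the next input is ground truth,
# shortening the chain of Jacobians the gradient must pass through.
import random
import torch
import torch.nn.functional as F

def multistage_train(model, optimizer, loader, stages=(1, 2, 4), p_tf=0.5):
    for k in stages:                      # progressively longer horizons
        for x0, targets in loader:        # targets: (horizon, batch, ...)
            x, loss = x0, 0.0
            for step in range(k):
                pred = model(x)
                loss = loss + F.mse_loss(pred, targets[step])
                # Teacher forcing: sometimes restart from the true state
                # instead of rolling the prediction forward.
                x = targets[step] if random.random() < p_tf else pred
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

Direct forecasting corresponds to stages=(1,) with a model that maps straight to the target lead time; the rolling variant above trades that for reusable short-range dynamics.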
4. arXiv:2411.16313 [pdf, other] cs.AI cs.LG
   CATP-LLM: Empowering Large Language Models for Cost-Aware Tool Planning
   Authors: Duo Wu, Jinghe Wang, Yuan Meng, Yanning Zhang, Le Sun, Zhi Wang
   Abstract: Utilizing large language models (LLMs) for tool planning has emerged as a promising avenue for developing general AI systems, where LLMs automatically schedule external tools (e.g. vision models) to tackle complex tasks based on task descriptions. To push this paradigm toward practical applications, it is crucial for LLMs to consider tool execution costs (e.g. execution time) for tool planning. Unfortunately, prior studies overlook the tool execution costs, leading to the generation of expensive plans whose costs outweigh task performance. To fill this gap, we propose the Cost-Aware Tool Planning with LLMs (CATP-LLM) framework, which for the first time provides a coherent design to empower LLMs for cost-aware tool planning. Specifically, CATP-LLM incorporates a tool planning language that enables the LLM to generate non-sequential plans with multiple branches for efficient concurrent tool execution and cost reduction. Moreover, it designs a cost-aware offline reinforcement learning algorithm to fine-tune the LLM to optimize the performance-cost trade-off in tool planning. Given the lack of public cost-related datasets, we further present OpenCATP, the first platform for cost-aware planning evaluation. Experiments on OpenCATP show that CATP-LLM outperforms GPT-4 even when using Llama2-7B as its backbone, with an average improvement of 28.2%-30.2% in plan performance and 24.7%-45.8% lower costs, even on challenging planning tasks. The code of CATP-LLM and OpenCATP will be publicly available.
   Submitted 25 November, 2024; originally announced November 2024.
   Comments: In submission.
5. arXiv:2411.14720 [pdf] cs.CL
   Optimizing Social Media Annotation of HPV Vaccine Skepticism and Misinformation Using Large Language Models: An Experimental Evaluation of In-Context Learning and Fine-Tuning Stance Detection Across Multiple Models
   Authors: Luhang Sun, Varsha Pendyala, Yun-Shiuan Chuang, Shanglin Yang, Jonathan Feldman, Andrew Zhao, Munmun De Choudhury, Sijia Yang, Dhavan Shah
   Abstract: This paper leverages large language models (LLMs) to experimentally determine optimal strategies for scaling up social media content annotation for stance detection on HPV vaccine-related tweets. We examine both conventional fine-tuning and emergent in-context learning methods, systematically varying prompt engineering strategies across widely used LLMs and their variants (e.g., GPT-4, Mistral, and Llama3). Specifically, we varied prompt template design, shot sampling methods, and shot quantity to detect stance on HPV vaccination. Our findings reveal that 1) in general, in-context learning outperforms fine-tuning in stance detection for HPV vaccine social media content; 2) increasing shot quantity does not necessarily enhance performance across models; and 3) different LLMs and their variants present differing sensitivity to in-context learning conditions. We found that the optimal in-context learning configuration for stance detection on HPV vaccine tweets involves six stratified shots paired with detailed contextual prompts. This study highlights the potential of LLMs for research on social media stance and skepticism detection and provides an applicable approach.
   Submitted 21 November, 2024; originally announced November 2024.

6. arXiv:2411.13836 [pdf, other] cs.CV
   CLIPer: Hierarchically Improving Spatial Representation of CLIP for Open-Vocabulary Semantic Segmentation
   Authors: Lin Sun, Jiale Cao, Jin Xie, Xiaoheng Jiang, Yanwei Pang
   Abstract: Contrastive Language-Image Pre-training (CLIP) exhibits strong zero-shot classification ability on various image-level tasks, prompting research into adapting CLIP for pixel-level open-vocabulary semantic segmentation without additional training. The key is to improve the spatial representation of image-level CLIP, for example by replacing the self-attention map at the last layer with a self-self attention map or an attention map based on a vision foundation model. In this paper, we present a novel hierarchical framework, named CLIPer, that hierarchically improves the spatial representation of CLIP. The proposed CLIPer includes an early-layer fusion module and a fine-grained compensation module. We observe that the embeddings and attention maps at early layers can preserve spatial structural information. Inspired by this, we design the early-layer fusion module to generate segmentation maps with better spatial coherence. Afterwards, we employ a fine-grained compensation module to compensate for local details using the self-attention maps of a diffusion model. We conduct experiments on seven segmentation datasets. Our proposed CLIPer achieves state-of-the-art performance on these datasets. For instance, using ViT-L, CLIPer achieves mIoU of 69.8% and 43.3% on VOC and COCO Object, outperforming ProxyCLIP by 9.2% and 4.1% respectively.
   Submitted 20 November, 2024; originally announced November 2024.
   Comments: Homepage and code: https://linsun449.github.io/cliper
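The "self-self attention" substitution this line of work refers to is easy to see in isolation. The toy below is not CLIPer's code; it contrasts ordinary query-key attention with value-value attention on random tensors, the modification typically applied only at the last CLIP layer.

# Toy contrast (not CLIPer's code): standard attention vs. "self-self"
# (here value-value) attention, which keeps weights spatially localized.
import torch

def attn(q, k, v):
    # q, k, v: (tokens, dim); standard scaled dot-product attention.
    w = torch.softmax(q @ k.T / q.shape[-1] ** 0.5, dim=-1)
    return w @ v

def self_self_attn(v):
    # Affinity of the value projection with itself replaces q @ k.T.
    w = torch.softmax(v @ v.T / v.shape[-1] ** 0.5, dim=-1)
    return w @ v

tokens, dim = 196, 64      # e.g. 14x14 patch tokens from a ViT
q, k, v = (torch.randn(tokens, dim) for _ in range(3))
out_standard = attn(q, k, v)
out_selfself = self_self_attn(v)   # the last-layer substitution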
7. arXiv:2411.12591 [pdf, other] cs.CV cs.AI
   Thinking Before Looking: Improving Multimodal LLM Reasoning via Mitigating Visual Hallucination
   Authors: Haojie Zheng, Tianyang Xu, Hanchi Sun, Shu Pu, Ruoxi Chen, Lichao Sun
   Abstract: Multimodal large language models (MLLMs) have advanced the integration of visual and linguistic modalities, establishing themselves as the dominant paradigm for visual-language tasks. Current approaches like chain-of-thought (CoT) reasoning have augmented the cognitive capabilities of large language models (LLMs), yet their adaptation to MLLMs is hindered by heightened risks of hallucination in cross-modality comprehension. In this paper, we find that the "thinking while looking" paradigm in current multimodal CoT approaches, where reasoning chains are generated alongside visual input, fails to mitigate hallucinations caused by misleading images. To address these limitations, we propose the Visual Inference Chain (VIC) framework, a novel approach that constructs reasoning chains using textual context alone before introducing visual input, effectively reducing cross-modal biases and enhancing multimodal reasoning accuracy. Comprehensive evaluations demonstrate that VIC significantly improves zero-shot performance across various vision-related tasks, mitigating hallucinations while refining the reasoning capabilities of MLLMs. Our code repository can be found at https://github.com/Terry-Xu-666/visual_inference_chain
   Submitted 15 November, 2024; originally announced November 2024.
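VIC's "thinking before looking" reduces to a two-call pattern that is easy to sketch. The mllm callable below is a hypothetical stand-in (prompt in, text out, optional image), not the framework's real interface; its actual prompts live in the linked repository.

# Hypothetical two-stage pipeline in the spirit of VIC: reason first from
# text alone, then answer with the image attached. `mllm` is a stand-in
# with signature mllm(prompt, image=None) -> str, not a real API.
def visual_inference_chain(mllm, question, image):
    # Stage 1: build the reasoning chain WITHOUT the image, so misleading
    # visual content cannot steer the intermediate steps.
    plan = mllm(
        "Question: " + question + "\n"
        "List, step by step, the visual evidence needed to answer. "
        "Do not guess the answer yet."
    )
    # Stage 2: only now attach the image and execute the plan.
    return mllm(
        "Question: " + question + "\nReasoning plan:\n" + plan +
        "\nCheck each step against the image and give the final answer.",
        image=image,
    )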
8. arXiv:2411.11916 [pdf, other] cs.DB
   From Words to Structured Visuals: A Benchmark and Framework for Text-to-Diagram Generation and Editing
   Authors: Jingxuan Wei, Cheng Tan, Qi Chen, Gaowei Wu, Siyuan Li, Zhangyang Gao, Linzhuang Sun, Bihui Yu, Ruifeng Guo
   Abstract: We introduce the task of text-to-diagram generation, which focuses on creating structured visual representations directly from textual descriptions. Existing approaches in text-to-image and text-to-code generation lack the logical organization and flexibility needed to produce accurate, editable diagrams, often resulting in outputs that are either unstructured or difficult to modify. To address this gap, we introduce DiagramGenBenchmark, a comprehensive evaluation framework encompassing eight distinct diagram categories, including flowcharts, model architecture diagrams, and mind maps. Additionally, we present DiagramAgent, an innovative framework with four core modules (Plan Agent, Code Agent, Check Agent, and Diagram-to-Code Agent) designed to facilitate both the generation and refinement of complex diagrams. Our extensive experiments, which combine objective metrics with human evaluations, demonstrate that DiagramAgent significantly outperforms existing baseline models in terms of accuracy, structural coherence, and modifiability. This work not only establishes a foundational benchmark for the text-to-diagram generation task but also introduces a powerful toolset to advance research and applications in this emerging area.
   Submitted 17 November, 2024; originally announced November 2024.

9. arXiv:2411.11504 [pdf, other] cs.AI cs.CL stat.ML
   Search, Verify and Feedback: Towards Next Generation Post-training Paradigm of Foundation Models via Verifier Engineering
   Authors: Xinyan Guan, Yanjiang Liu, Xinyu Lu, Boxi Cao, Ben He, Xianpei Han, Le Sun, Jie Lou, Bowen Yu, Yaojie Lu, Hongyu Lin
   Abstract: The evolution of machine learning has increasingly prioritized the development of powerful models and more scalable supervision signals. However, the emergence of foundation models presents significant challenges in providing the effective supervision signals necessary for further enhancing their capabilities. Consequently, there is an urgent need to explore novel supervision signals and technical approaches. In this paper, we propose verifier engineering, a novel post-training paradigm specifically designed for the era of foundation models. The core of verifier engineering is to leverage a suite of automated verifiers to perform verification tasks and deliver meaningful feedback to foundation models. We systematically categorize the verifier engineering process into three essential stages: search, verify, and feedback, and provide a comprehensive review of state-of-the-art research developments within each stage. We believe that verifier engineering constitutes a fundamental pathway toward achieving Artificial General Intelligence.
   Submitted 18 November, 2024; originally announced November 2024.
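Read operationally, the three stages the survey names map onto a short loop. The sketch below is a schematic reading, not a system from the paper; model.generate, model.update, and the verifier callables are placeholders.

# Schematic search-verify-feedback loop (assumed reading of the survey's
# three stages); the model methods and verifiers are stubs.
def post_train_step(model, task, verifiers, n_candidates=8):
    # Search: sample candidate responses from the current policy.
    candidates = [model.generate(task) for _ in range(n_candidates)]
    # Verify: every automated verifier scores every candidate.
    scored = [(sum(v(task, c) for v in verifiers), c) for c in candidates]
    # Feedback: route the verifier signal back into training, e.g. as a
    # reward for RL or as chosen/rejected pairs for preference tuning.
    _, best = max(scored, key=lambda s: s[0])
    _, worst = min(scored, key=lambda s: s[0])
    model.update(task, chosen=best, rejected=worst)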
10. arXiv:2411.10440 [pdf, other] cs.CV
    LLaVA-CoT: Let Vision Language Models Reason Step-by-Step
    Authors: Guowei Xu, Peng Jin, Hao Li, Yibing Song, Lichao Sun, Li Yuan
    Abstract: Large language models have demonstrated substantial advancements in reasoning capabilities, particularly through inference-time scaling, as illustrated by models such as OpenAI's o1. However, current Vision-Language Models (VLMs) often struggle to perform systematic and structured reasoning, especially when handling complex visual question-answering tasks. In this work, we introduce LLaVA-CoT, a novel VLM designed to conduct autonomous multistage reasoning. Unlike chain-of-thought prompting, LLaVA-CoT independently engages in sequential stages of summarization, visual interpretation, logical reasoning, and conclusion generation. This structured approach enables LLaVA-CoT to achieve marked improvements in precision on reasoning-intensive tasks. To accomplish this, we compile the LLaVA-CoT-100k dataset, integrating samples from various visual question answering sources and providing structured reasoning annotations. In addition, we propose an inference-time stage-level beam search method, which enables effective inference-time scaling. Remarkably, with only 100k training samples and a simple yet effective inference-time scaling method, LLaVA-CoT not only outperforms its base model by 8.9% on a wide range of multimodal reasoning benchmarks, but also surpasses the performance of larger and even closed-source models, such as Gemini-1.5-pro, GPT-4o-mini, and Llama-3.2-90B-Vision-Instruct.
    Submitted 25 November, 2024; v1 submitted 15 November, 2024; originally announced November 2024.
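The stage-level beam search the abstract proposes can be outlined generically: sample several candidates per reasoning stage, keep the best, and move on. generate and score below are hypothetical stand-ins for the VLM and its candidate-ranking step; only the stage names follow the abstract.

# Assumed outline of stage-level beam search (not the released method):
# per stage, sample k candidates and commit only the highest-scoring one.
STAGES = ["summary", "visual interpretation", "reasoning", "conclusion"]

def stage_level_beam_search(generate, score, question, image, k=4):
    context = question
    for stage in STAGES:
        candidates = [generate(stage, context, image) for _ in range(k)]
        best = max(candidates, key=lambda c: score(stage, context, c))
        context = context + "\n[" + stage + "] " + best  # commit the stage
    return context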
11. arXiv:2411.07828 [pdf, ps, other] cs.LG
    Suite-IN: Aggregating Motion Features from Apple Suite for Robust Inertial Navigation
    Authors: Lan Sun, Songpengcheng Xia, Junyuan Deng, Jiarui Yang, Zengyuan Lai, Qi Wu, Ling Pei
    Abstract: With the rapid development of wearable technology, devices like smartphones, smartwatches, and headphones equipped with IMUs have become essential for applications such as pedestrian positioning. However, traditional pedestrian dead reckoning (PDR) methods struggle with diverse motion patterns, while recent data-driven approaches, though improving accuracy, often lack robustness due to reliance on a single device. In our work, we attempt to enhance positioning performance using the low-cost commodity IMUs embedded in wearable devices. We propose a multi-device deep learning framework named Suite-IN, aggregating motion data from the Apple Suite for inertial navigation. Motion data captured by sensors on different body parts contains both local and global motion information, making it essential to reduce the negative effects of localized movements and extract global motion representations from multiple devices.
    Submitted 12 November, 2024; originally announced November 2024.
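One way to picture the aggregation problem Suite-IN targets: each device contributes a motion feature, and a learned gate should discount devices dominated by local limb motion. The module below is a toy under assumed shapes and design choices, not the authors' architecture.

# Toy multi-device fusion (assumed design, not Suite-IN): per-device GRU
# encoders plus a learned softmax gate that can down-weight a device
# dominated by local motion (e.g. a swinging wrist) before pooling.
import torch
import torch.nn as nn

class MultiDeviceFusion(nn.Module):
    def __init__(self, n_devices=3, imu_dim=6, hidden=32):
        super().__init__()
        self.encoders = nn.ModuleList(
            nn.GRU(imu_dim, hidden, batch_first=True) for _ in range(n_devices)
        )
        self.gate = nn.Linear(hidden, 1)   # per-device reliability score
        self.head = nn.Linear(hidden, 2)   # e.g. 2D velocity estimate

    def forward(self, imu_streams):
        # imu_streams: list of (batch, time, imu_dim) tensors, one per device
        feats = [enc(x)[0][:, -1] for enc, x in zip(self.encoders, imu_streams)]
        feats = torch.stack(feats, dim=1)           # (batch, devices, hidden)
        w = torch.softmax(self.gate(feats), dim=1)  # attention over devices
        return self.head((w * feats).sum(dim=1))    # fused global feature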
Multi-Draft Speculative Decoding (MDSD) offers a promising solution by using a smaller draft model to generate multiple token sequences, which the target LLM verifies in parallel. However, current heuristic approaches, such as Recurs&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05289v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05289v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05289v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have become essential in advancing natural language processing (NLP) tasks, but their sequential token generation limits inference speed. Multi-Draft Speculative Decoding (MDSD) offers a promising solution by using a smaller draft model to generate multiple token sequences, which the target LLM verifies in parallel. However, current heuristic approaches, such as Recursive Rejection Sampling (RRS), suffer from low acceptance rates in subsequent drafts, limiting the advantages of using multiple drafts. Meanwhile, Optimal Transport with Membership Cost (OTM) can theoretically improve acceptance rates, but its computational cost is too high for real-time use. We present SpecHub, a novel, efficient sampling-verification method for MDSD that improves acceptance rates with only linear computational overhead. By simplifying the OTM problem into a compact Linear Programming model, SpecHub significantly reduces computational complexity. It further accelerates sampling by leveraging a sparse joint distribution, focusing computation on high-probability token sequences. In extensive experiments, Spechub consistently generates 0.05-0.27 and 0.02-0.16 more tokens per step than RRS and RRS without replacement. We attach our code at \url{https://github.com/MasterGodzilla/Speculative_decoding_OT}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05289v1-abstract-full').style.display = 'none'; document.getElementById('2411.05289v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 

arXiv:2411.04714 [pdf, other] cs.CV
Revisiting Disparity from Dual-Pixel Images: Physics-Informed Lightweight Depth Estimation
Authors: Teppei Kurita, Yuhi Kondo, Legong Sun, Takayuki Sasaki, Sho Nitta, Yasuhiro Hashimoto, Yoshinori Muramatsu, Yusuke Moriuchi
Abstract: In this study, we propose a high-performance disparity (depth) estimation method using dual-pixel (DP) images with few parameters. Conventional end-to-end deep-learning methods have many parameters but do not fully exploit disparity constraints, which limits their performance. Therefore, we propose a lightweight disparity estimation method based on a completion-based network that explicitly constrains disparity and learns the physical and systemic disparity properties of DP. By modeling the DP-specific disparity error parametrically and using it for sampling during training, the network acquires the unique properties of DP and enhances robustness. This learning also allows us to use a common RGB-D dataset for training without a DP dataset, which is labor-intensive to acquire. Furthermore, we propose a non-learning-based refinement framework that efficiently handles inherent disparity expansion errors by appropriately refining the confidence map of the network output.
As a result, the proposed method achieved state-of-the-art results while reducing the overall system size to 1/5 of that of the conventional method, even without using the DP dataset for training, thereby demonstrating its effectiveness. The code and dataset are available on our project site.
Submitted 6 November, 2024; originally announced November 2024.
Comments: Accepted to IEEE Winter Conference on Applications of Computer Vision (WACV) 2025

arXiv:2411.03823 [pdf, other] cs.CV cs.AI cs.CL cs.MM
Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM Data Contamination
Authors: Dingjie Song, Sicheng Lai, Shunian Chen, Lichao Sun, Benyou Wang
Abstract: The rapid progression of multimodal large language models (MLLMs) has demonstrated superior performance on various multimodal benchmarks.
However, the issue of data contamination during training creates challenges in performance evaluation and comparison. While numerous methods exist for detecting dataset contamination in large language models (LLMs), they are less effective for MLLMs due to their various modalities and multiple training phases. In this study, we introduce a multimodal data contamination detection framework, MM-Detect, designed for MLLMs. Our experimental results indicate that MM-Detect is sensitive to varying degrees of contamination and can highlight significant performance improvements due to leakage of the training set of multimodal benchmarks. Furthermore, we explore the possibility of contamination originating from the pre-training phase of LLMs used by MLLMs and the fine-tuning phase of MLLMs, offering new insights into the stages at which contamination may be introduced.
Submitted 6 November, 2024; originally announced November 2024.

arXiv:2411.03669 [pdf, other] cs.RO
Imagined Potential Games: A Framework for Simulating, Learning and Evaluating Interactive Behaviors
Authors: Lingfeng Sun, Yixiao Wang, Pin-Yun Hung, Changhao Wang, Xiang Zhang, Zhuo Xu, Masayoshi Tomizuka
Abstract: Interacting with human agents in complex scenarios presents a significant challenge for robotic navigation, particularly in environments that necessitate both collision avoidance and collaborative interaction, such as indoor spaces. Unlike static or predictably moving obstacles, human behavior is inherently complex and unpredictable, stemming from dynamic interactions with other agents.
Existing simulation tools frequently fail to adequately model such reactive and collaborative behaviors, impeding the development and evaluation of robust social navigation strategies. This paper introduces a novel framework utilizing distributed potential games to simulate human-like interactions in highly interactive scenarios. Within this framework, each agent imagines a virtual cooperative game with others based on its estimation. We demonstrate this formulation can facilitate the generation of diverse and realistic interaction patterns in a configurable manner across various scenarios. Additionally, we have developed a gym-like environment leveraging our interactive agent model to facilitate the learning and evaluation of interactive navigation algorithms.
Submitted 6 November, 2024; originally announced November 2024.
Comments: 13 pages, 10 figures. arXiv admin note: substantial text overlap with arXiv:2310.01614

arXiv:2411.03554 [pdf, other] cs.CV
Benchmarking Vision Language Model Unlearning via Fictitious Facial Identity Dataset
Authors: Yingzi Ma, Jiongxiao Wang, Fei Wang, Siyuan Ma, Jiazhao Li, Xiujun Li, Furong Huang, Lichao Sun, Bo Li, Yejin Choi, Muhao Chen, Chaowei Xiao
Abstract: Machine unlearning has emerged as an effective strategy for forgetting specific information in the training data. However, with the increasing integration of visual data, privacy concerns in Vision Language Models (VLMs) remain underexplored. To address this, we introduce Facial Identity Unlearning Benchmark (FIUBench), a novel VLM unlearning benchmark designed to robustly evaluate the effectiveness of unlearning algorithms under the Right to be Forgotten setting. Specifically, we formulate the VLM unlearning task via constructing the Fictitious Facial Identity VQA dataset and apply a two-stage evaluation pipeline that is designed to precisely control the sources of information and their exposure levels. In terms of evaluation, since VLMs support many ways of asking questions with the same semantic meaning, we also provide robust evaluation metrics, including membership inference attacks and carefully designed adversarial privacy attacks, to evaluate the performance of algorithms. Through the evaluation of four baseline VLM unlearning algorithms within FIUBench, we find that all methods remain limited in their unlearning performance, with significant trade-offs between model utility and forget quality. Furthermore, our findings also highlight the importance of privacy attacks for robust evaluations. We hope FIUBench will drive progress in developing more effective VLM unlearning algorithms.
Submitted 24 November, 2024; v1 submitted 5 November, 2024; originally announced November 2024.
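The simplest member of the membership-inference family used for such evaluations is a loss-threshold test. The sketch below is generic and illustrative, not FIUBench's exact metric; the loss arrays are synthetic stand-ins.

```python
# Generic loss-threshold membership inference: a sample is guessed to be
# "remembered" (a member) if the model's loss on it falls below a threshold
# calibrated on held-out non-member data. After successful unlearning,
# forgotten samples should look like non-members (accuracy near 0.5).
import numpy as np

def mia_loss_threshold(member_losses, nonmember_losses):
    threshold = np.median(nonmember_losses)          # crude calibration
    guesses = np.concatenate([member_losses, nonmember_losses]) < threshold
    labels = np.concatenate([np.ones_like(member_losses),
                             np.zeros_like(nonmember_losses)]).astype(bool)
    return (guesses == labels).mean()                # attack accuracy

# Synthetic example: members have systematically lower loss.
acc = mia_loss_threshold(np.random.rand(100) * 0.5, np.random.rand(100))
print(acc)
```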

arXiv:2411.03250 [pdf, other] cs.LG cs.AI cs.CL
DiffLM: Controllable Synthetic Data Generation via Diffusion Language Models
Authors: Ying Zhou, Xinyao Wang, Yulei Niu, Yaojie Shen, Lexin Tang, Fan Chen, Ben He, Le Sun, Longyin Wen
Abstract: Recent advancements in large language models (LLMs) have significantly enhanced their knowledge and generative capabilities, leading to a surge of interest in leveraging LLMs for high-quality data synthesis.
However, synthetic data generation via prompting LLMs remains challenging due to LLMs' limited understanding of target data distributions and the complexity of prompt engineering, especially for structured formatted data. To address these issues, we introduce DiffLM, a controllable data synthesis framework based on a variational autoencoder (VAE), which further (1) leverages diffusion models to preserve more information of the original distribution and format structure in the learned latent distribution and (2) decouples the learning of target distribution knowledge from the LLM's generative objectives via a plug-and-play latent feature injection module. As we observed significant discrepancies between the VAE's latent representations and the real data distribution, the latent diffusion module is introduced into our framework to learn a fully expressive latent distribution. Evaluations on seven real-world datasets with structured formatted data (i.e., tabular, code, and tool data) demonstrate that DiffLM generates high-quality data, with performance on downstream tasks surpassing that of real data by 2-7 percent in certain cases. The data and code will be publicly available upon completion of internal review.
Submitted 5 November, 2024; originally announced November 2024.
Comments: 17 pages, 8 figures

arXiv:2411.03082 [pdf, other] cs.CV cs.AI cs.NE
Self-supervised cross-modality learning for uncertainty-aware object detection and recognition in applications which lack pre-labelled training data
Authors: Irum Mehboob, Li Sun, Alireza Astegarpanah, Rustam Stolkin
Abstract: This paper shows how an uncertainty-aware, deep neural network can be trained to detect, recognise and localise objects in 2D RGB images, in applications lacking annotated training datasets. We propose a self-supervising teacher-student pipeline, in which a relatively simple teacher classifier, trained with only a few labelled 2D thumbnails, automatically processes a larger body of unlabelled RGB-D data to teach a student network based on a modified YOLOv3 architecture. Firstly, 3D object detection with back projection is used to automatically extract and teach 2D detection and localisation information to the student network. Secondly, a weakly supervised 2D thumbnail classifier, with minimal training on a small number of hand-labelled images, is used to teach object category recognition. Thirdly, we use a Gaussian Process (GP) to encode and teach a robust uncertainty estimation functionality, so that the student can output confidence scores with each categorization. The resulting student significantly outperforms the same YOLO architecture trained directly on the same amount of labelled data. Our GP-based approach yields robust and meaningful uncertainty estimations for complex industrial object classifications. The end-to-end network is also capable of real-time processing, needed for robotics applications. Our method can be applied to many important industrial tasks, where labelled datasets are typically unavailable. In this paper, we demonstrate an example of detection, localisation, and object category recognition of nuclear mixed-waste materials in highly cluttered and unstructured scenes. This is critical for robotic sorting and handling of legacy nuclear waste, which poses complex environmental remediation challenges in many nuclearised nations.
Submitted 5 November, 2024; originally announced November 2024.
Comments: 16 pages
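The GP-based confidence idea can be illustrated with a stock Gaussian process classifier over feature embeddings. This is generic scikit-learn usage under assumed inputs (random stand-ins for thumbnail embeddings), not the paper's exact pipeline.

```python
# Illustrative sketch: fit a GP classifier on a few labelled embeddings, then
# read per-class probabilities as confidence scores for each categorization.
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(0)
X = rng.normal(size=(40, 8))      # stand-in for 2D-thumbnail embeddings
y = (X[:, 0] > 0).astype(int)     # stand-in category labels

gp = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp.fit(X, y)

probs = gp.predict_proba(rng.normal(size=(5, 8)))
print(probs.max(axis=1))          # confidence score per detection
```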

arXiv:2410.23426 [pdf, other] cs.CL
Social Science Meets LLMs: How Reliable Are Large Language Models in Social Simulations?
Authors: Yue Huang, Zhengqing Yuan, Yujun Zhou, Kehan Guo, Xiangqi Wang, Haomin Zhuang, Weixiang Sun, Lichao Sun, Jindong Wang, Yanfang Ye, Xiangliang Zhang
Abstract: Large Language Models (LLMs) are increasingly employed for simulations, enabling applications in role-playing agents and Computational Social Science (CSS). However, the reliability of these simulations is under-explored, which raises concerns about the trustworthiness of LLMs in these applications.
In this paper, we aim to answer "How reliable is LLM-based simulation?" To address this, we introduce TrustSim, an evaluation dataset covering 10 CSS-related topics, to systematically investigate the reliability of the LLM simulation. We conducted experiments on 14 LLMs and found that inconsistencies persist in the LLM-based simulated roles. In addition, the consistency level of LLMs does not strongly correlate with their general performance. To enhance the reliability of LLMs in simulation, we propose Adaptive Learning Rate Based ORPO (AdaORPO), a reinforcement learning-based algorithm to improve the reliability in simulation across 7 LLMs. Our research provides a foundation for future studies to explore more robust and trustworthy LLM-based simulations.
Submitted 30 October, 2024; originally announced October 2024.

arXiv:2410.21027 [pdf, other] cs.LG cs.CL
Transferable Post-training via Inverse Value Learning
Authors: Xinyu Lu, Xueru Wen, Yaojie Lu, Bowen Yu, Hongyu Lin, Haiyang Yu, Le Sun, Xianpei Han, Yongbin Li
Abstract: As post-training processes utilize increasingly large datasets and base models continue to grow in size, the computational demands and implementation challenges of existing algorithms are escalating significantly. In this paper, we propose modeling the changes at the logits level during post-training using a separate neural network (i.e., the value network). After training this network on a small base model using demonstrations, it can be seamlessly integrated with other pre-trained models during inference, enabling them to achieve similar capability enhancements. We systematically investigate the best practices for this paradigm in terms of pre-training weights and connection schemes. We demonstrate that the resulting value network has broad transferability across pre-trained models of different parameter sizes within the same family, models undergoing continuous pre-training within the same family, and models with different vocabularies across families. In certain cases, it can achieve performance comparable to full-parameter fine-tuning. Furthermore, we explore methods to enhance the transferability of the value model and prevent overfitting to the base model used during training.
Submitted 28 October, 2024; originally announced October 2024.
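The logits-level composition described here can be sketched as adding a separately trained network's output to a frozen base model's logits at inference time. Everything in this sketch (the value network's architecture, shapes, the toy base model) is a placeholder; only the additive composition is the point.

```python
# Minimal sketch, assuming a hypothetical value network that maps base-model
# logits to a logit offset. The same trained offset network could then be
# reused with a larger base model sharing the vocabulary, which is the
# transferability the abstract investigates.
import torch
import torch.nn as nn

class ValueNetwork(nn.Module):
    def __init__(self, vocab_size, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(vocab_size, hidden), nn.ReLU(), nn.Linear(hidden, vocab_size)
        )

    def forward(self, base_logits):       # models the post-training change
        return self.net(base_logits)

@torch.no_grad()
def composed_next_token_logits(base_model, value_net, input_ids):
    base_logits = base_model(input_ids)   # (batch, vocab)
    return base_logits + value_net(base_logits)

toy_base = lambda ids: torch.randn(ids.shape[0], 100)   # stand-in for an LM
vnet = ValueNetwork(100)
out = composed_next_token_logits(toy_base, vnet, torch.zeros(2, 5, dtype=torch.long))
print(out.shape)   # (2, 100)
```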

arXiv:2410.20746 [pdf, other] cs.CL cs.CY cs.HC
ElectionSim: Massive Population Election Simulation Powered by Large Language Model Driven Agents
Authors: Xinnong Zhang, Jiayu Lin, Libo Sun, Weihong Qi, Yihang Yang, Yue Chen, Hanjia Lyu, Xinyi Mou, Siming Chen, Jiebo Luo, Xuanjing Huang, Shiping Tang, Zhongyu Wei
Abstract: The massive population election simulation aims to model the preferences of specific groups in particular election scenarios. It has garnered significant attention for its potential to forecast real-world social trends.
Traditional agent-based modeling (ABM) methods are limited in their ability to incorporate complex individual background information and provide interactive prediction results. In this paper, we introduce ElectionSim, an innovative election simulation framework based on large language models, designed to support accurate voter simulations and customized distributions, together with an interactive platform to dialogue with simulated voters. We present a million-level voter pool sampled from social media platforms to support accurate individual simulation. We also introduce PPE, a poll-based presidential election benchmark to assess the performance of our framework under the U.S. presidential election scenario. Through extensive experiments and analyses, we demonstrate the effectiveness and robustness of our framework in U.S. presidential election simulations.
Submitted 6 November, 2024; v1 submitted 28 October, 2024; originally announced October 2024.
Comments: 42 pages, 14 figures

arXiv:2410.19989 [pdf, other] cs.RO cs.LG
On-Robot Reinforcement Learning with Goal-Contrastive Rewards
Authors: Ondrej Biza, Thomas Weng, Lingfeng Sun, Karl Schmeckpeper, Tarik Kelestemur, Yecheng Jason Ma, Robert Platt, Jan-Willem van de Meent, Lawson L. S. Wong
Abstract: Reinforcement Learning (RL) has the potential to enable robots to learn from their own actions in the real world. Unfortunately, RL can be prohibitively expensive, in terms of on-robot runtime, due to inefficient exploration when learning from a sparse reward signal. Designing dense reward functions is labour-intensive and requires domain expertise. In our work, we propose GCR (Goal-Contrastive Rewards), a dense reward function learning method that can be trained on passive video demonstrations. By using videos without actions, our method is easier to scale, as we can use arbitrary videos. GCR combines two loss functions, an implicit value loss function that models how the reward increases when traversing a successful trajectory, and a goal-contrastive loss that discriminates between successful and failed trajectories. We perform experiments in simulated manipulation environments across RoboMimic and MimicGen tasks, as well as in the real world using a Franka arm and a Spot quadruped. We find that GCR leads to more sample-efficient RL, enabling model-free RL to solve about twice as many tasks as our baseline reward learning methods. We also demonstrate positive cross-embodiment transfer from videos of people and of other robots performing a task. Appendix: https://tinyurl.com/gcr-appendix-2.
Submitted 25 October, 2024; originally announced October 2024.
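The shape of the two-loss combination can be sketched roughly: a value that should increase along a successful trajectory, plus a contrastive margin separating successful from failed endings. The exact formulations in the paper may differ; this is only an illustrative rendering of the idea with hypothetical names and a toy value network.

```python
# Rough sketch of the two GCR-style ingredients named in the abstract:
# (1) push the learned value to increase along a successful trajectory, and
# (2) contrast goal-reaching frames against failed endings with a margin.
import torch
import torch.nn.functional as F

def gcr_style_loss(value_fn, success_frames, fail_frames, margin=1.0):
    v_succ = value_fn(success_frames)                  # (T,) along success
    # (1) Monotonicity: v_{t+1} should exceed v_t on a successful rollout.
    value_loss = F.relu(v_succ[:-1] - v_succ[1:]).mean()
    # (2) Goal-contrastive margin between final frames.
    v_fail = value_fn(fail_frames)
    contrastive = F.relu(margin - (v_succ[-1] - v_fail[-1])).mean()
    return value_loss + contrastive

value_net = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.ReLU(),
                                torch.nn.Linear(32, 1))
loss = gcr_style_loss(lambda x: value_net(x).squeeze(-1),
                      torch.randn(20, 16), torch.randn(20, 16))
loss.backward()
```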

arXiv:2410.17941 [pdf, other] cs.LG
Spiking Graph Neural Network on Riemannian Manifolds
Authors: Li Sun, Zhenhao Huang, Qiqi Wan, Hao Peng, Philip S. Yu
Abstract: Graph neural networks (GNNs) have become the dominant solution for learning on graphs, the typical non-Euclidean structures. Conventional GNNs, constructed with the Artificial Neuron Network (ANN), have achieved impressive performance at the cost of high computation and energy consumption. In parallel, spiking GNNs with brain-like spiking neurons are drawing increasing research attention owing to the energy efficiency. So far, existing spiking GNNs consider graphs in Euclidean space, ignoring the structural geometry, and suffer from the high latency issue due to Back-Propagation-Through-Time (BPTT) with the surrogate gradient.
In light of the aforementioned issues, we are devoted to exploring spiking GNNs on Riemannian manifolds, and present a Manifold-valued Spiking GNN (MSG). In particular, we design a new spiking neuron on geodesically complete manifolds with the diffeomorphism, so that BPTT regarding the spikes is replaced by the proposed differentiation via manifold. Theoretically, we show that MSG approximates a solver of the manifold ordinary differential equation. Extensive experiments on common graphs show the proposed MSG achieves superior performance to previous spiking GNNs and better energy efficiency than conventional GNNs.
Submitted 23 October, 2024; originally announced October 2024.
Comments: Accepted by NeurIPS 2024, 30 pages

arXiv:2410.17131 [pdf, other] cs.CL
Aligning Large Language Models via Self-Steering Optimization
Authors: Hao Xiang, Bowen Yu, Hongyu Lin, Keming Lu, Yaojie Lu, Xianpei Han, Le Sun, Jingren Zhou, Junyang Lin
Abstract: Automated alignment develops alignment systems with minimal human intervention. The key to automated alignment lies in providing learnable and accurate preference signals for preference learning without human annotation.
In this paper, we introduce Self-Steering Optimization (SSO), an algorithm that autonomously generates high-quality preference signals based on predefined principles during iterative training, eliminating the need for manual annotation. SSO maintains the accuracy of signals by ensuring a consistent gap between chosen and rejected responses while keeping them both on-policy to suit the current policy model's learning capacity. SSO can benefit the online and offline training of the policy model, as well as enhance the training of reward models. We validate the effectiveness of SSO with two foundation models, Qwen2 and Llama3.1, indicating that it provides accurate, on-policy preference signals throughout iterative training. Without any manual annotation or external models, SSO leads to significant performance improvements across six subjective or objective benchmarks. Besides, the preference data generated by SSO significantly enhanced the performance of the reward model on RewardBench. Our work presents a scalable approach to preference optimization, paving the way for more efficient and effective automated alignment.
Submitted 22 October, 2024; originally announced October 2024.

arXiv:2410.15698 [pdf, other] cs.LG
Solving Continual Offline RL through Selective Weights Activation on Aligned Spaces
Authors: Jifeng Hu, Sili Huang, Li Shen, Zhejian Yang, Shengchao Hu, Shisong Tang, Hechang Chen, Yi Chang, Dacheng Tao, Lichao Sun
Abstract: Continual offline reinforcement learning (CORL) has shown impressive ability in diffusion-based lifelong learning systems by modeling the joint distributions of trajectories. However, most research only focuses on limited continual task settings where the tasks have the same observation and action space, which deviates from the realistic demands of training agents in various environments. In view of this, we propose Vector-Quantized Continual Diffuser, named VQ-CD, to break the barrier of different spaces between various tasks. Specifically, our method contains two complementary sections, where the quantized spaces alignment provides a unified basis for the selective weights activation. In the quantized spaces alignment, we leverage vector quantization to align the different state and action spaces of various tasks, facilitating continual training in the same space. Then, we leverage a unified diffusion model with an attached inverse dynamics model to master all tasks by selectively activating different weights according to task-related sparse masks. Finally, we conduct extensive experiments on 15 continual learning (CL) tasks, including conventional CL task settings (identical state and action spaces) and general CL task settings (various state and action spaces). Compared with 16 baselines, our method reaches the SOTA performance.
Submitted 21 October, 2024; originally announced October 2024.
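The core vector-quantization step behind "quantized spaces alignment" is a nearest-codebook lookup: states and actions from tasks with different native spaces are projected to a shared dimension and snapped to a shared codebook. The sizes below are illustrative, not the paper's configuration.

```python
# Standard vector quantization: snap each vector to its nearest codebook
# entry so all tasks train in one shared discrete space.
import torch

def vector_quantize(z, codebook):
    """z: (batch, d); codebook: (K, d). Returns quantized vectors, indices."""
    dists = torch.cdist(z, codebook)   # (batch, K) pairwise distances
    idx = dists.argmin(dim=1)
    return codebook[idx], idx

codebook = torch.randn(512, 32)        # shared across all tasks
z_task_a = torch.randn(8, 32)          # projected task-A states (toy)
z_q, idx = vector_quantize(z_task_a, codebook)
print(idx[:5])
```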

arXiv:2410.14853 [pdf, other] cs.CL cs.AI
DFlow: Diverse Dialogue Flow Simulation with Large Language Models
Authors: Wanyu Du, Song Feng, James Gung, Lijia Sun, Yi Zhang, Saab Mansour, Yanjun Qi
Abstract: Developing language model-based dialogue agents requires effective data to train models that can follow specific task logic. However, most existing data augmentation methods focus on increasing diversity in language, topics, or dialogue acts at the utterance level, largely neglecting a critical aspect of task logic diversity at the dialogue level. This paper proposes a novel data augmentation method designed to enhance the diversity of synthetic dialogues by focusing on task execution logic. Our method uses LLMs to generate decision-tree-structured task plans, which enable the derivation of diverse dialogue trajectories for a given task. Each trajectory, referred to as a "dialog flow", guides the generation of a multi-turn dialogue that follows a unique trajectory. We apply this method to generate a task-oriented dialogue dataset comprising 3,886 dialogue flows across 15 different domains. We validate the effectiveness of this dataset using the next action prediction task, where models fine-tuned on our dataset outperform strong baselines, including GPT-4. Upon acceptance of this paper, we plan to release the code and data publicly.
Submitted 18 October, 2024; originally announced October 2024.
Comments: 16 pages
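Deriving dialog flows from a decision-tree task plan amounts to enumerating root-to-leaf paths, each of which then seeds one multi-turn dialogue. The task tree below is a made-up example, not from the released dataset.

```python
# Enumerate root-to-leaf paths of a decision-tree task plan; each path is one
# "dialog flow". Nodes are (action, {branch_label: child_node}) tuples.
def enumerate_flows(node, prefix=()):
    action, branches = node
    path = prefix + (action,)
    if not branches:                      # leaf: one complete dialog flow
        yield path
    for child in branches.values():
        yield from enumerate_flows(child, path)

task_plan = ("ask_account_id", {
    "found":     ("verify_identity", {"pass": ("process_refund", {}),
                                      "fail": ("escalate_to_agent", {})}),
    "not_found": ("offer_account_search", {}),
})

for flow in enumerate_flows(task_plan):
    print(" -> ".join(flow))
```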
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14853">arXiv:2410.14853</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14853">pdf</a>, <a href="https://arxiv.org/format/2410.14853">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> DFlow: Diverse Dialogue Flow Simulation with Large Language Models </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Du%2C+W">Wanyu Du</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+S">Song Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Gung%2C+J">James Gung</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lijia Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mansour%2C+S">Saab Mansour</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Y">Yanjun Qi</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Developing language model-based dialogue agents requires effective data to train models that can follow specific task logic. However, most existing data augmentation methods focus on increasing diversity in language, topics, or dialogue acts at the utterance level, largely neglecting a critical aspect of task logic diversity at the dialogue level. This paper proposes a novel data augmentation method designed to enhance the diversity of synthetic dialogues by focusing on task execution logic. Our method uses LLMs to generate decision tree-structured task plans, enabling the derivation of diverse dialogue trajectories for a given task. Each trajectory, referred to as a &#34;dialog flow&#34;, guides the generation of a multi-turn dialogue that follows a unique trajectory. We apply this method to generate a task-oriented dialogue dataset comprising 3,886 dialogue flows across 15 different domains. We validate the effectiveness of this dataset using the next action prediction task, where models fine-tuned on our dataset outperform strong baselines, including GPT-4. Upon acceptance of this paper, we plan to release the code and data publicly. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages</span> </p>
</li>
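<p class="is-size-7"> The decision-tree task plans described above map naturally onto root-to-leaf traversal: each leaf path is one dialog flow. The tree below is hand-written for illustration; in DFlow the plan would be LLM-generated and each flow would then seed a multi-turn dialogue. </p>
<pre><code>
# Sketch: deriving distinct dialog flows as root-to-leaf paths of a
# decision-tree task plan (hand-written toy tree, not an LLM output).
task_plan = {
    "verify identity": {
        "identity ok": {
            "locate booking": {
                "refundable": {"issue refund": {}},
                "non-refundable": {"offer voucher": {}},
            }
        },
        "identity failed": {"escalate to human agent": {}},
    }
}

def dialog_flows(node, path=()):
    if not node:                      # leaf: one complete flow
        yield list(path)
        return
    for step, subtree in node.items():
        yield from dialog_flows(subtree, path + (step,))

for flow in dialog_flows(task_plan):
    print(" -> ".join(flow))
</code></pre>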
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13841">arXiv:2410.13841</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13841">pdf</a>, <a href="https://arxiv.org/format/2410.13841">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> A Unified View of Delta Parameter Editing in Post-Trained Large-Scale Models </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Q">Qiaoyu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Le Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Le Sun</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Post-training has emerged as a crucial paradigm for adapting large-scale pre-trained models to various tasks, whose effects are fully reflected by delta parameters (i.e., the disparity between post-trained and pre-trained parameters). While numerous studies have explored delta parameter properties via operations like pruning, quantization, low-rank approximation, and extrapolation, a unified framework for systematically examining these characteristics has been lacking. In this paper, we propose a novel perspective based on Riemann sum approximation of the loss function to elucidate delta parameter editing operations. Our analysis categorizes existing methods into three classes based on their post-editing performance: competitive, decreased, and improved, explaining how they are expressed by the Riemann sum approximation term and how they alter model performance. Extensive experiments on both visual and language models, including ViT, LLaMA 3, Qwen 2, and Mistral, corroborate our theoretical findings. Furthermore, we introduce extensions to existing techniques like DARE and BitDelta, highlighting their limitations in leveraging the properties of delta parameters and reorganizing them into general expressions to enhance the applicability and effectiveness of delta parameter editing in post-trained models. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
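<p class="is-size-7"> Of the editing operations mentioned above, DARE has a particularly compact form that can be sketched directly: randomly drop entries of the delta and rescale the survivors so the delta is preserved in expectation. The toy parameters and the drop rate below are illustrative. </p>
<pre><code>
# Sketch of one delta-parameter editing operation: DARE-style
# random drop-and-rescale applied to delta = post - pre.
import numpy as np

rng = np.random.default_rng(0)
pre  = rng.normal(size=1000)                 # pre-trained parameters (toy stand-in)
post = pre + 0.01 * rng.normal(size=1000)    # post-trained parameters

p = 0.9                                      # drop rate (illustrative)
delta = post - pre
keep = rng.random(delta.shape) > p           # drop ~90% of delta entries
edited = pre + (delta * keep) / (1.0 - p)    # rescale so E[edited delta] = delta

print(np.allclose(edited.mean(), post.mean(), atol=1e-2))
</code></pre>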
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13804">arXiv:2410.13804</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13804">pdf</a>, <a href="https://arxiv.org/format/2410.13804">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> BenTo: Benchmark Task Reduction with In-Context Transferability </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Hongyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lichao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+T">Tianyi Zhou</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Evaluating large language models (LLMs) is costly: it requires the generation and examination of LLM outputs on a large-scale benchmark of various tasks. This paper investigates how to efficiently reduce the tasks used to benchmark LLMs without affecting the evaluation quality. Our study reveals that task transferability and relevance provide critical information for identifying the most representative subset of tasks via optimizing a facility location function. We propose a practically efficient metric for estimating the transferability between two tasks via in-context learning (ICL). By analyzing the pairwise transferability, we can reduce the tasks in a modern LLM benchmark (e.g., MMLU or FLAN) to 5% while inducing only a &lt;4% difference to the evaluation on the original benchmark. Compared to prior works, our method is training-free and gradient-free, and it is highly efficient, requiring only ICL. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/tianyi-lab/bento</span> </p>
</li>
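<p class="is-size-7"> A sketch of the facility-location selection the abstract refers to: given a pairwise transferability matrix T, greedily add the task that most increases how well the chosen subset covers all tasks. The random T and the budget below are placeholders for the paper's ICL-based estimates. </p>
<pre><code>
# Greedy facility-location selection over a pairwise task
# transferability matrix T (T[i, j] = how well task j covers task i).
import numpy as np

rng = np.random.default_rng(0)
n_tasks, budget = 20, 3
T = rng.random((n_tasks, n_tasks))           # stand-in for estimated transferability

selected = []
for _ in range(budget):
    def coverage(subset):
        # facility-location objective: each task is covered by its best pick
        return T[:, subset].max(axis=1).sum()
    best = max((j for j in range(n_tasks) if j not in selected),
               key=lambda j: coverage(selected + [j]))
    selected.append(best)

print("representative subset:", sorted(selected))
</code></pre>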
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13757">arXiv:2410.13757</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13757">pdf</a>, <a href="https://arxiv.org/format/2410.13757">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div>
<p class="title is-5 mathjax"> MobA: A Two-Level Agent System for Efficient Mobile Task Automation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zichen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+H">Hao Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yansi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+K">Kunyao Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yixuan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yixiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Situo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Liangtai Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+K">Kai Yu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Current mobile assistants are limited by dependence on system APIs or struggle with complex user instructions and diverse interfaces due to restricted comprehension and decision-making abilities. To address these challenges, we propose MobA, a novel Mobile phone Agent powered by multimodal large language models that enhances comprehension and planning capabilities through a sophisticated two-level agent architecture. The high-level Global Agent (GA) is responsible for understanding user commands, tracking history memories, and planning tasks. The low-level Local Agent (LA) predicts detailed actions in the form of function calls, guided by sub-tasks and memory from the GA. Integrating a Reflection Module allows for efficient task completion and enables the system to handle previously unseen complex tasks. MobA demonstrates significant improvements in task execution efficiency and completion rate in real-life evaluations, underscoring the potential of MLLM-empowered mobile assistants. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 6 figures, and 5 tables. We will release our source code in a few days</span> </p>
</li>
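<p class="is-size-7"> A stub of the two-level control loop described above, with hard-coded stand-ins where MobA would call multimodal LLMs; the sub-task format and function names are invented for illustration. </p>
<pre><code>
# Two-level loop sketch: a Global Agent (GA) plans sub-tasks, a Local
# Agent (LA) turns each sub-task into a concrete function call.
def global_agent(command, memory):
    # GA: decompose the user command into sub-tasks (hard-coded stand-in)
    return ["open_app:Clock", "tap:New Alarm", "set_time:07:00"]

def local_agent(subtask):
    # LA: predict a detailed action in the form of a function call
    name, _, arg = subtask.partition(":")
    return {"function": name, "argument": arg}

memory = []
for sub in global_agent("set an alarm for 7am", memory):
    action = local_agent(sub)
    memory.append(action)             # GA memory tracks executed actions
    print(action)
</code></pre>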
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13083">arXiv:2410.13083</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13083">pdf</a>, <a href="https://arxiv.org/format/2410.13083">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div>
<p class="title is-5 mathjax"> FedCAP: Robust Federated Learning via Customized Aggregation and Personalization </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Youpeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinda Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F">Fuxun Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lichao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenbin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuyu Wang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Federated learning (FL), an emerging distributed machine learning paradigm, has been applied to various privacy-preserving scenarios. However, due to its distributed nature, FL faces two key issues: the non-independent and identical distribution (non-IID) of user data and vulnerability to Byzantine threats. To address these challenges, in this paper, we propose FedCAP, a robust FL framework against both data heterogeneity and Byzantine attacks. The core of FedCAP is a model update calibration mechanism to help a server capture the differences in the direction and magnitude of model updates among clients. Furthermore, we design a customized model aggregation rule that facilitates collaborative training among similar clients while accelerating the model deterioration of malicious clients. With a Euclidean norm-based anomaly detection mechanism, the server can quickly identify and permanently remove malicious clients. Moreover, the impact of data heterogeneity and Byzantine attacks can be further mitigated through personalization on the client side. We conduct extensive experiments, comparing multiple state-of-the-art baselines, to demonstrate that FedCAP performs well in several non-IID settings and shows strong robustness under a series of poisoning attacks. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 12 figures, 5 tables, accepted by 2024 Annual Computer Security Applications Conference (ACSAC 2024)</span> </p>
</li>
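<p class="is-size-7"> One plausible reading of the Euclidean norm-based anomaly detection mentioned above, sketched with toy client updates; the 3x-median threshold is an illustrative choice, not the paper's rule. </p>
<pre><code>
# Flag clients whose update norm deviates far from the median norm.
import numpy as np

rng = np.random.default_rng(0)
updates = [rng.normal(scale=1.0, size=100) for _ in range(9)]
updates.append(rng.normal(scale=25.0, size=100))   # one malicious client

norms = np.array([np.linalg.norm(u) for u in updates])
median = np.median(norms)
malicious = np.where(norms > 3 * median)[0]        # illustrative threshold
print("flagged clients:", malicious.tolist())
</code></pre>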
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12475">arXiv:2410.12475</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12475">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div>
<p class="title is-5 mathjax"> Aegis: An Advanced LLM-Based Multi-Agent for Intelligent Functional Safety Engineering </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+L">Lu Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+B">Bin Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiarui Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Z">Zhanzhao Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zhaowei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+W">Wenke Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lin Sun</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Functional safety is a critical aspect of automotive engineering, encompassing all phases of a vehicle&#39;s lifecycle, including design, development, production, operation, and decommissioning. This domain involves highly knowledge-intensive tasks. This paper introduces Aegis: An Advanced LLM-Based Multi-Agent for Intelligent Functional Safety Engineering. Aegis is specifically designed to support complex functional safety tasks within the automotive sector. It is tailored to perform Hazard Analysis and Risk Assessment (HARA), document Functional Safety Requirements (FSR), and plan test cases for Automatic Emergency Braking (AEB) systems. The most advanced version, Aegis-Max, leverages Retrieval-Augmented Generation (RAG) and reflective mechanisms to enhance its capability in managing complex, knowledge-intensive tasks. Additionally, targeted prompt refinement by professional functional safety practitioners can significantly optimize Aegis&#39;s performance in the functional safety domain. This paper demonstrates the potential of Aegis to improve the efficiency and effectiveness of functional safety processes in automotive engineering. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12298">arXiv:2410.12298</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12298">pdf</a>, <a href="https://arxiv.org/format/2410.12298">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> Pyramid-Driven Alignment: Pyramid Principle Guided Integration of Large Language Models and Knowledge Graphs </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinchen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Youdi Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Large Language Models (LLMs) possess impressive reasoning abilities but are prone to generating incorrect information, often referred to as hallucinations. While incorporating external Knowledge Graphs (KGs) can partially mitigate this issue, existing methods primarily treat KGs as static knowledge repositories, overlooking the critical disparity between KG and LLM knowledge and failing to fully exploit the reasoning capabilities inherent in KGs. To address these limitations, we propose Pyramid-Driven Alignment (PDA), a novel framework for seamlessly integrating LLMs with KGs. PDA utilizes Pyramid Principle analysis to construct a hierarchical pyramid structure. This structure is designed to reflect the input question and generate more validated deductive knowledge, thereby enhancing the alignment of LLMs and KGs and ensuring more cohesive integration. Furthermore, PDA employs a recursive mechanism to harness the underlying reasoning abilities of KGs, resulting in more accurate knowledge retrieval for question-answering tasks. Our experimental results reveal a substantial performance advantage of PDA over state-of-the-art baselines, with improvements reaching 26.70% and 26.78%. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11651">arXiv:2410.11651</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11651">pdf</a>, <a href="https://arxiv.org/format/2410.11651">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> RS-MOCO: A deep learning-based topology-preserving image registration method for cardiac T1 mapping </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chiyi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Longwei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+D">Dong Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+H">Haifeng Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+H">Hongwu Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yanjie Zhu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Cardiac T1 mapping can evaluate various clinical symptoms of myocardial tissue. However, there is currently a lack of effective, robust, and efficient methods for motion correction in cardiac T1 mapping. In this paper, we propose a deep learning-based, topology-preserving image registration framework for motion correction in cardiac T1 mapping. Notably, our proposed implicit consistency constraint, dubbed BLOC, preserves the image topology in registration to some extent through a bidirectional consistency constraint and a local anti-folding constraint. To address the contrast variation issue, we introduce a weighted image similarity metric for multimodal registration of cardiac T1-weighted images. In addition, a semi-supervised myocardium segmentation network and a dual-domain attention module are integrated into the framework to further improve registration performance. Extensive comparative experiments and ablation studies demonstrate the effectiveness and high robustness of our method. The results also indicate that the proposed weighted image similarity metric, specifically crafted for our network, contributes substantially to the motion correction efficacy, while the bidirectional consistency constraint combined with the local anti-folding constraint yields a more desirable topology-preserving registration mapping. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
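<p class="is-size-7"> The local anti-folding idea has a standard form that can be sketched directly: penalize grid points where the Jacobian determinant of the deformation x -&gt; x + u(x) turns negative. The random field below is only a stand-in for a network-predicted displacement; this is not the paper's exact loss. </p>
<pre><code>
# Anti-folding penalty for a toy 2D displacement field u.
import numpy as np

rng = np.random.default_rng(0)
H, W = 32, 32
u = 0.5 * rng.normal(size=(H, W, 2))          # toy displacement field

# finite-difference gradients of phi = identity + u
dux_dy, dux_dx = np.gradient(u[..., 0])
duy_dy, duy_dx = np.gradient(u[..., 1])
jac_det = (1 + dux_dx) * (1 + duy_dy) - dux_dy * duy_dx

# penalize only the folded (negative-determinant) locations
folding_penalty = np.clip(-jac_det, 0, None).mean()
print("mean folding penalty:", folding_penalty)
</code></pre>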
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09133">arXiv:2410.09133</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09133">pdf</a>, <a href="https://arxiv.org/format/2410.09133">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> MVG-CRPS: A Robust Loss Function for Multivariate Probabilistic Forecasting </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+V+Z">Vincent Zhihao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lijun Sun</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In probabilistic time series forecasting, the multivariate Gaussian (MVG) distribution is widely used as the predictive distribution for correlated continuous random variables. Current deep probabilistic models typically employ neural networks to parameterize the mean vector and covariance matrix of the distribution, with the log-score (i.e., negative log-likelihood) as the default loss function. However, the log-score is highly sensitive to outliers, leading to significant errors when anomalies are present in the data. Motivated by the use of the continuous ranked probability score (CRPS) in learning univariate distributions, we propose a robust loss function specifically designed for high-dimensional MVG outputs. The proposed MVG-CRPS loss function has a closed-form expression based on the neural network outputs, making it easily integrable into deep learning models. We evaluate MVG-CRPS on two probabilistic forecasting tasks, multivariate autoregressive and univariate sequence-to-sequence (Seq2Seq) forecasting, both involving observations that follow an MVG distribution. Experimental results on real-world datasets demonstrate that MVG-CRPS achieves both robustness and efficiency, offering enhanced accuracy and uncertainty quantification in probabilistic forecasting. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
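<p class="is-size-7"> The paper's closed-form MVG-CRPS expression is not reproduced in this listing. As a stand-in, the sample-based energy score, a standard multivariate generalization of CRPS, conveys the same idea of scoring a full MVG predictive distribution against one observed vector: ES(F, y) = E||X - y|| - 0.5 E||X - X'|| with X, X' drawn from F. </p>
<pre><code>
# Monte Carlo estimate of the energy score for an MVG forecast.
import numpy as np

rng = np.random.default_rng(0)
d, m = 5, 256
mean, cov = np.zeros(d), np.eye(d)            # toy predictive distribution
y = rng.normal(size=d)                        # observed vector

X  = rng.multivariate_normal(mean, cov, size=m)
X2 = rng.multivariate_normal(mean, cov, size=m)

term1 = np.linalg.norm(X - y, axis=1).mean()  # E||X - y||
term2 = np.linalg.norm(X - X2, axis=1).mean() # E||X - X'||
print("energy score:", term1 - 0.5 * term2)
</code></pre>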
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08815">arXiv:2410.08815</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08815">pdf</a>, <a href="https://arxiv.org/format/2410.08815">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> StructRAG: Boosting Knowledge Intensive Reasoning of LLMs via Inference-time Hybrid Information Structurization </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhuoqun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuanang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Haiyang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Q">Qiaoyu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yongbin Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Retrieval-augmented generation (RAG) is a key means to effectively enhance large language models (LLMs) in many knowledge-based tasks. However, existing RAG methods struggle with knowledge-intensive reasoning tasks because the useful information required for these tasks is badly scattered. This characteristic makes it difficult for existing RAG methods to accurately identify key information and perform global reasoning with such noisy augmentation. In this paper, motivated by cognitive theories that humans convert raw information into various forms of structured knowledge when tackling knowledge-intensive reasoning, we propose a new framework, StructRAG, which can identify the optimal structure type for the task at hand, reconstruct the original documents into this structured format, and infer answers based on the resulting structure. Extensive experiments across various knowledge-intensive tasks show that StructRAG achieves state-of-the-art performance, particularly excelling in challenging scenarios, demonstrating its potential as an effective solution for enhancing LLMs in complex real-world applications. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07804">arXiv:2410.07804</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07804">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div>
<p class="title is-5 mathjax"> Intuitive interaction flow: A Dual-Loop Human-Machine Collaboration Task Allocation Model and an experimental study </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+Q">Qiyang Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Ziyuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yilin Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lingyun Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tianyang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+J">Jingru Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qichao Zhao</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This study investigates the issue of task allocation in Human-Machine Collaboration (HMC) within the context of Industry 4.0. By integrating philosophical insights and cognitive science, it clearly defines two typical modes of human behavior in human-machine interaction (HMI): skill-based intuitive behavior and knowledge-based intellectual behavior. Building on this, the concept of &#39;intuitive interaction flow&#39; is introduced by combining human intuition with machine humanoid intelligence, leading to the construction of a dual-loop HMC task allocation model. Through comparative experiments measuring electroencephalogram (EEG) and electromyogram (EMG) activities, distinct physiological patterns associated with these behavior modes are identified, providing a preliminary foundation for future adaptive HMC frameworks. This work offers a pathway for developing intelligent HMC systems that effectively integrate human intuition and machine intelligence in Industry 4.0. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07693">arXiv:2410.07693</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07693">pdf</a>, <a href="https://arxiv.org/format/2410.07693">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> Multi-Facet Counterfactual Learning for Content Quality Evaluation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+J">Jiasheng Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+B">Boxi Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+M">Meng Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Le Sun</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Evaluating the quality of documents is essential for filtering valuable content from the current massive amount of information. Conventional approaches typically rely on a single score as the supervision signal for training content quality evaluators, which is inadequate for differentiating documents whose quality varies across multiple facets. In this paper, we propose Multi-facet cOunterfactual LEarning (MOLE), a framework for efficiently constructing evaluators that perceive multiple facets of content quality. Given a specific scenario, we prompt large language models to generate counterfactual content that varies in critical quality facets compared to the original document. Furthermore, we leverage a joint training strategy based on contrastive learning and supervised learning to enable the evaluator to distinguish between different quality facets, resulting in more accurate predictions of content quality scores. Experimental results on two datasets across different scenarios demonstrate that our proposed MOLE framework effectively improves the correlation of document quality evaluations with human judgments, serving as a valuable toolkit for effective information acquisition. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li>
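<p class="is-size-7"> A toy rendering of the joint training strategy described above: a supervised regression term on gold quality scores plus a contrastive margin term pushing the original document above its degraded counterfactual. The function names, score values, and margin are illustrative assumptions, not MOLE's exact objective. </p>
<pre><code>
# Joint supervised + contrastive loss sketch for one (original,
# counterfactual) document pair; scalar scores for simplicity.
def joint_loss(pred_orig, pred_cf, gold_orig, gold_cf, margin=0.5):
    # supervised: regress both predictions toward gold scores
    supervised = (pred_orig - gold_orig) ** 2 + (pred_cf - gold_cf) ** 2
    # contrastive: the original should out-score its degraded counterfactual
    contrastive = max(0.0, margin - (pred_orig - pred_cf))
    return supervised + contrastive

print(joint_loss(pred_orig=0.8, pred_cf=0.6, gold_orig=0.9, gold_cf=0.3))
</code></pre>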
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07561">arXiv:2410.07561</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07561">pdf</a>, <a href="https://arxiv.org/format/2410.07561">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> AI-Press: A Multi-Agent News Generating and Feedback Simulation System Powered by Large Language Models </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiawei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shiyue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xinnong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kuang%2C+H">Haoyu Kuang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Libo Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yihang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Siming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zhongyu Wei</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The rise of various social platforms has transformed journalism. The growing demand for news content has led to the increased use of large language models (LLMs) in news production due to their speed and cost-effectiveness. However, LLMs still encounter limitations in professionalism and ethical judgment in news generation. Additionally, predicting public feedback is usually difficult before news is released. To tackle these challenges, we introduce AI-Press, an automated news drafting and polishing system based on multi-agent collaboration and Retrieval-Augmented Generation. We develop a feedback simulation system that generates public feedback considering demographic distributions. Through extensive quantitative and qualitative evaluations, our system shows significant improvements in news-generating capabilities and verifies the effectiveness of public feedback simulation. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 4 figures</span> </p>
</li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07066">arXiv:2410.07066</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07066">pdf</a>, <a href="https://arxiv.org/format/2410.07066">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> A Gentle Introduction and Tutorial on Deep Generative Models in Transportation Research </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Choi%2C+S">Seongjin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Zhixiong Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Ham%2C+S+W">Seung Woo Ham</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Jiwon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lijun Sun</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Deep Generative Models (DGMs) have rapidly advanced in recent years, becoming essential tools in various fields due to their ability to learn complex data distributions and generate synthetic data. Their importance in transportation research is increasingly recognized, particularly for applications like traffic data generation, prediction, and feature extraction. This paper offers a comprehensive introduction and tutorial on DGMs, with a focus on their applications in transportation. It begins with an overview of generative models, followed by detailed explanations of fundamental models, a systematic review of the literature, and practical tutorial code to aid implementation. The paper also discusses current challenges and opportunities, highlighting how these models can be effectively utilized and further developed in transportation research. This paper serves as a valuable reference, guiding researchers and practitioners from foundational knowledge to advanced applications of DGMs in transportation research. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">64 pages, 21 figures, 4 tables</span> </p>
</li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06802">arXiv:2410.06802</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06802">pdf</a>, <a href="https://arxiv.org/format/2410.06802">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> Seg2Act: Global Context-aware Action Generation for Document Logical Structuring </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zichao Li</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Shaojie He</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+M">Meng Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuanang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yanxiong Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Le Sun</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Document logical structuring aims to extract the underlying hierarchical structure of documents, which is crucial for document intelligence. Traditional approaches often fall short in handling the complexity and the variability of lengthy documents. To address these issues, we introduce Seg2Act, an end-to-end, generation-based method for document logical structuring, revisiting logical structure extraction as an action generation task. Specifically, given the text segments of a document, Seg2Act iteratively generates the action sequence via a global context-aware generative model, and simultaneously updates its global context and current logical structure based on the generated actions. Experiments on ChCatExt and HierDoc datasets demonstrate the superior performance of Seg2Act in both supervised and transfer learning settings. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Main Conference</span> </p>
</li>
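<p class="is-size-7"> A sketch of the action-generation view of logical structuring: each text segment arrives with one action that edits a stack and grows the tree. The three-action scheme (sub / same / up) is an invented stand-in for Seg2Act's actual action set, and the actions here are hard-coded rather than model-generated. </p>
<pre><code>
# Apply a generated action sequence to build a document hierarchy.
import json

segments = ["1 Intro", "1.1 Background", "1.2 Scope", "2 Method"]
actions  = ["sub", "sub", "same", "up"]       # one action per segment

root = {"title": "ROOT", "children": []}
stack = [root]
for seg, act in zip(segments, actions):
    if act == "same":
        stack.pop()                 # sibling: close the current node
    elif act == "up":
        stack.pop(); stack.pop()    # close current node and its parent
    node = {"title": seg, "children": []}
    stack[-1]["children"].append(node)
    stack.append(node)

print(json.dumps(root, indent=2))
</code></pre>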
arXiv:2410.06560 [pdf, other] cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Mitigating Time Discretization Challenges with WeatherODE: A Sandwich Physics-Driven Neural ODE for Weather Forecasting
Authors: Peiyuan Liu, Tian Zhou, Liang Sun, Rong Jin
Abstract: In the field of weather forecasting, traditional models often grapple with discretization errors and time-dependent source discrepancies, which limit their predictive performance. In this paper, we present WeatherODE, a novel one-stage, physics-driven ordinary differential equation (ODE) model designed to enhance weather forecasting accuracy. By leveraging wave equation theory and integrating a time-dependent source model, WeatherODE effectively addresses the challenges associated with time-discretization error and dynamic atmospheric processes. Moreover, we design a CNN-ViT-CNN sandwich structure, facilitating efficient learning dynamics tailored for distinct yet interrelated tasks with varying optimization biases in advection equation estimation. Through rigorous experiments, WeatherODE demonstrates superior performance in both global and regional weather forecasting tasks, outperforming recent state-of-the-art approaches by significant margins of over 40.0% and 31.8% in root mean square error (RMSE), respectively. The source code is available at https://github.com/DAMO-DI-ML/WeatherODE.
Submitted 9 October, 2024; originally announced October 2024.

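A bare-bones picture of the ODE-with-source idea in this abstract: learned dynamics plus a time-dependent source term, integrated over the forecast horizon. The tiny MLPs and the forward-Euler integrator below are illustrative assumptions, not WeatherODE's CNN-ViT-CNN architecture or its actual solver.

```python
import torch
import torch.nn as nn

# Sketch: the state evolves as dx/dt = f(x) + s(x, t), a learned dynamics
# term plus a time-dependent source, integrated with forward-Euler steps.
# All network shapes here are illustrative placeholders.
class TinyWeatherODE(nn.Module):
    def __init__(self, dim=16):
        super().__init__()
        self.dynamics = nn.Sequential(nn.Linear(dim, dim), nn.Tanh(), nn.Linear(dim, dim))
        self.source = nn.Sequential(nn.Linear(dim + 1, dim), nn.Tanh(), nn.Linear(dim, dim))

    def forward(self, x, t0=0.0, t1=6.0, steps=24):
        dt = (t1 - t0) / steps
        t = t0
        for _ in range(steps):
            t_col = torch.full_like(x[..., :1], t)           # broadcast time feature
            dx = self.dynamics(x) + self.source(torch.cat([x, t_col], dim=-1))
            x = x + dt * dx                                  # forward-Euler update
            t += dt
        return x

model = TinyWeatherODE()
forecast = model(torch.randn(4, 16))  # batch of 4 toy atmospheric states
```
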
arXiv:2410.06104 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
RefineStyle: Dynamic Convolution Refinement for StyleGAN
Authors: Siwei Xia, Xueqi Hu, Li Sun, Qingli Li
Abstract: In StyleGAN, convolution kernels are shaped by both static parameters shared across images and dynamic modulation factors $w^+\in\mathcal{W}^+$ specific to each image. Therefore, $\mathcal{W}^+$ space is often used for image inversion and editing. However, the pre-trained model struggles with synthesizing out-of-domain images due to the limited capabilities of $\mathcal{W}^+$ and its resultant kernels, necessitating full fine-tuning or adaptation through a complex hypernetwork. This paper proposes an efficient refining strategy for dynamic kernels. The key idea is to modify kernels by low-rank residuals, learned from the input image or domain guidance. These residuals are generated by matrix multiplication between two token sets of equal size, which controls the complexity. We validate the refining scheme on image inversion and domain adaptation. In the former task, we design grouped transformer blocks to learn these token sets by one- or two-stage training. In the latter task, token sets are directly optimized to support synthesis in the target domain while preserving the original content. Extensive experiments show that our method achieves low distortion for image inversion and high quality for out-of-domain editing.
Submitted 8 October, 2024; originally announced October 2024.
Comments: Accepted by PRCV2024

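The low-rank residual construction is easy to picture: two token sets multiply into a rank-limited update that is added to the modulated kernel. The shapes and rank below are illustrative assumptions, not StyleGAN's actual dimensions.

```python
import torch

# Sketch: a rank-r residual from two equal-count token sets refines a
# (flattened) convolution kernel; r bounds both the residual's rank and the
# number of extra parameters. Dimensions are illustrative.
out_ch, in_ch, r = 512, 512, 8
W = torch.randn(out_ch, in_ch)   # modulated kernel, spatial dims folded in
A = torch.randn(out_ch, r)       # token set 1 (e.g., predicted from the image)
B = torch.randn(r, in_ch)        # token set 2 (e.g., from domain guidance)
W_refined = W + A @ B            # low-rank update: rank(A @ B) <= r
```
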
arXiv:2410.05726 [pdf, other] cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Less is more: Embracing sparsity and interpolation with Esiformer for time series forecasting
Authors: Yangyang Guo, Yanjun Zhao, Sizhe Dang, Tian Zhou, Liang Sun, Yi Qian
Abstract: Time series forecasting plays a significant role in many practical fields. However, time series data generated from real-world applications often exhibit high variance and substantial noise, which makes it difficult to capture the inherent periodic patterns of the data and significantly hurts prediction accuracy. To address this issue, we propose the Esiformer, which applies interpolation to the original data, decreasing its overall variance and alleviating the influence of noise. In addition, we enhance the vanilla transformer with a robust sparse FFN, which strengthens the representation ability of the model while maintaining robustness and avoiding the risk of overfitting relative to the vanilla implementation. Through evaluations on challenging real-world datasets, our method outperforms the leading model PatchTST, reducing MSE by 6.5% and MAE by 5.8% in multivariate time series forecasting. Code is available at: https://github.com/yyg1282142265/Esiformer/tree/main.
Submitted 8 October, 2024; originally announced October 2024.

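The variance-reduction effect of interpolation is easy to verify numerically. Midpoint (linear) interpolation below is an assumption about the exact scheme; the point is only that inserting interpolated samples smooths the series.

```python
import numpy as np

# Sketch: insert linear midpoints between consecutive observations; the
# augmented series is smoother and has lower sample variance. Esiformer's
# exact interpolation scheme may differ.
def interpolate_midpoints(x):
    x = np.asarray(x, dtype=float)
    out = np.empty(2 * len(x) - 1)
    out[0::2] = x
    out[1::2] = (x[:-1] + x[1:]) / 2.0
    return out

x = np.array([1.0, 4.0, 2.0, 5.0])
print(interpolate_midpoints(x))                      # [1.  2.5 4.  3.  2.  3.5 5. ]
print(np.var(x), np.var(interpolate_midpoints(x)))   # 2.5 -> 1.5
```
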
arXiv:2410.05584 [pdf, other] cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Rethinking Reward Model Evaluation: Are We Barking up the Wrong Tree?
Authors: Xueru Wen, Jie Lou, Yaojie Lu, Hongyu Lin, Xing Yu, Xinyu Lu, Ben He, Xianpei Han, Debing Zhang, Le Sun
Abstract: Reward Models (RMs) are crucial for aligning language models with human preferences. Currently, the evaluation of RMs depends on measuring accuracy against a validation set of manually annotated preference data. Although this method is straightforward and widely adopted, the relationship between RM accuracy and downstream policy performance remains under-explored. In this work, we conduct experiments in a synthetic setting to investigate how differences in RMs, as measured by accuracy, translate into gaps in optimized policy performance. Our findings reveal that while there is a weak positive correlation between accuracy and downstream performance, policies optimized towards RMs with similar accuracy can exhibit quite different performance. Moreover, we discover that the way accuracy is measured significantly impacts its ability to predict final policy performance. Through the lens of the Regressional Goodhart effect, we identify exogenous variables impacting the relationship between RM quality measured by accuracy and policy model capability. This underscores the inadequacy of relying solely on accuracy to reflect an RM's impact on policy optimization.
Submitted 15 October, 2024; v1 submitted 7 October, 2024; originally announced October 2024.

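The evaluation protocol the paper questions is simple to state: accuracy is the fraction of annotated preference pairs the RM orders correctly. A minimal sketch, with a deliberately naive stand-in scorer:

```python
# Sketch of standard RM evaluation: the fraction of preference pairs where
# the reward model scores the chosen response above the rejected one.
def rm_accuracy(pairs, reward):
    correct = sum(reward(prompt, chosen) > reward(prompt, rejected)
                  for prompt, chosen, rejected in pairs)
    return correct / len(pairs)

# Toy usage with a hypothetical length-biased scorer; the paper's point is
# that similar accuracies can hide very different downstream policies.
pairs = [("q1", "a thorough answer", "meh"), ("q2", "yes", "a rambling reply")]
print(rm_accuracy(pairs, reward=lambda prompt, response: len(response)))  # 0.5
```
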
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wen%2C+X">Xueru Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Lou%2C+J">Jie Lou</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xing Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xinyu Lu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+B">Ben He</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Debing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Le Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05584v2-abstract-short" style="display: inline;"> Reward Models (RMs) are crucial for aligning language models with human preferences. Currently, the evaluation of RMs depends on measuring accuracy against a validation set of manually annotated preference data. Although this method is straightforward and widely adopted, the relationship between RM accuracy and downstream policy performance remains under-explored. In this work, we conduct experime&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05584v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05584v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05584v2-abstract-full" style="display: none;"> Reward Models (RMs) are crucial for aligning language models with human preferences. Currently, the evaluation of RMs depends on measuring accuracy against a validation set of manually annotated preference data. Although this method is straightforward and widely adopted, the relationship between RM accuracy and downstream policy performance remains under-explored. In this work, we conduct experiments in a synthetic setting to investigate how differences in RM measured by accuracy translate into gaps in optimized policy performance. Our findings reveal that while there is a weak positive correlation between accuracy and downstream performance, policies optimized towards RMs with similar accuracy can exhibit quite different performance. Moreover, we discover that the way of measuring accuracy significantly impacts its ability to predict the final policy performance. Through the lens of Regressional Goodhart&#39;s effect, we identify the existence of exogenous variables impacting the relationship between RM quality measured by accuracy and policy model capability. This underscores the inadequacy of relying solely on accuracy to reflect their impact on policy optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05584v2-abstract-full').style.display = 'none'; document.getElementById('2410.05584v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
arXiv:2410.04013 [pdf, other] cs.LG (Machine Learning)
Improving Temporal Link Prediction via Temporal Walk Matrix Projection
Authors: Xiaodong Lu, Leilei Sun, Tongyu Zhu, Weifeng Lv
Abstract: Temporal link prediction, aiming at predicting future interactions among entities based on historical interactions, is crucial for a series of real-world applications. Although previous methods have demonstrated the importance of relative encodings for effective temporal link prediction, computational efficiency remains a major concern in constructing these encodings. Moreover, existing relative encodings are usually constructed based on structural connectivity, where temporal information is seldom considered. To address the aforementioned issues, we first analyze existing relative encodings and unify them as a function of temporal walk matrices. This unification establishes a connection between relative encodings and temporal walk matrices, providing a more principled way to analyze and design relative encodings. Based on this analysis, we propose a new temporal graph neural network called TPNet, which introduces a temporal walk matrix that incorporates the time decay effect to simultaneously consider both temporal and structural information. Moreover, TPNet designs a random feature propagation mechanism with theoretical guarantees to implicitly maintain the temporal walk matrices, improving computation and storage efficiency. Experimental results on 13 benchmark datasets verify the effectiveness and efficiency of TPNet, where TPNet outperforms other baselines on most datasets and achieves a maximum speedup of 33.3x compared to the SOTA baseline. Our code can be found at https://github.com/lxd99/TPNet.
Submitted 4 October, 2024; originally announced October 2024.
Comments: NeurIPS 2024 Paper

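One way to picture the implicit-maintenance trick: give each node a random signature and let interactions propagate exponentially decayed features, so that inner products approximate time-decayed walk counts. The update rule and decay form below are illustrative assumptions, not TPNet's exact mechanism or its theoretical guarantees.

```python
import numpy as np

# Sketch: random features stand in for rows of a time-decayed walk matrix.
# On each interaction (u, v, t), endpoints absorb each other's decayed
# features; feat[u] @ base[v] then roughly tracks decayed walks from v to u.
class RandomFeatureWalks:
    def __init__(self, num_nodes, dim=64, decay=0.1, seed=0):
        rng = np.random.default_rng(seed)
        self.base = rng.normal(size=(num_nodes, dim)) / np.sqrt(dim)
        self.feat = self.base.copy()
        self.last = np.zeros(num_nodes)
        self.decay = decay

    def _decayed(self, n, t):
        return self.feat[n] * np.exp(-self.decay * (t - self.last[n]))

    def update(self, u, v, t):
        fu, fv = self._decayed(u, t), self._decayed(v, t)
        self.feat[u], self.feat[v] = fu + fv, fv + fu
        self.last[[u, v]] = t

    def score(self, u, v):
        return float(self.feat[u] @ self.base[v])  # relative-encoding proxy

rf = RandomFeatureWalks(num_nodes=5)
rf.update(0, 1, t=1.0); rf.update(1, 2, t=2.0)
print(rf.score(2, 0))  # nonzero: node 2 picked up node 0's signature via node 1
```
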
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04013v1-abstract-full').style.display = 'none'; document.getElementById('2410.04013v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03417">arXiv:2410.03417</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.03417">pdf</a>, <a href="https://arxiv.org/format/2410.03417">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Img2CAD: Conditioned 3D CAD Model Generation from Single Image with Structured Visual Geometry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianrun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chunan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yuanqi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+R">Runlong Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lanyun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zang%2C+Y">Ying Zang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zejian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Linyun Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03417v1-abstract-short" style="display: inline;"> In this paper, we propose Img2CAD, the first approach to our knowledge that uses 2D image inputs to generate CAD models with editable parameters. Unlike existing AI methods for 3D model generation using text or image inputs often rely on mesh-based representations, which are incompatible with CAD tools and lack editability and fine control, Img2CAD enables seamless integration between AI-based 3D&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03417v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03417v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03417v1-abstract-full" style="display: none;"> In this paper, we propose Img2CAD, the first approach to our knowledge that uses 2D image inputs to generate CAD models with editable parameters. Unlike existing AI methods for 3D model generation using text or image inputs often rely on mesh-based representations, which are incompatible with CAD tools and lack editability and fine control, Img2CAD enables seamless integration between AI-based 3D reconstruction and CAD software. 
arXiv:2410.03027 [pdf, other] cs.LG (Machine Learning); cs.CL (Computation and Language)
MLP-KAN: Unifying Deep Representation and Function Learning
Authors: Yunhong He, Yifeng Xie, Zhengqing Yuan, Lichao Sun
Abstract: Recent advancements in both representation learning and function learning have demonstrated substantial promise across diverse domains of artificial intelligence. However, the effective integration of these paradigms poses a significant challenge, particularly in cases where users must manually decide whether to apply a representation learning or function learning model based on dataset characteristics. To address this issue, we introduce MLP-KAN, a unified method designed to eliminate the need for manual model selection. By integrating Multi-Layer Perceptrons (MLPs) for representation learning and Kolmogorov-Arnold Networks (KANs) for function learning within a Mixture-of-Experts (MoE) architecture, MLP-KAN dynamically adapts to the specific characteristics of the task at hand, ensuring optimal performance. Embedded within a transformer-based framework, our work achieves remarkable results on four widely used datasets across diverse domains. Extensive experimental evaluation demonstrates its superior versatility, delivering competitive performance across both deep representation and function learning tasks. These findings highlight the potential of MLP-KAN to simplify the model selection process, offering a comprehensive, adaptable solution across various domains. Our code and weights are available at https://github.com/DLYuanGod/MLP-KAN.
Submitted 3 October, 2024; originally announced October 2024.

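The routing idea can be sketched as a two-expert mixture: a gate assigns each input a mixing weight over an MLP expert and a KAN expert. The KAN expert below is a crude learnable-sine stand-in, not a faithful Kolmogorov-Arnold network, and the shapes and gating are assumptions.

```python
import torch
import torch.nn as nn

# Sketch: a gate mixes an MLP expert (representation learning) with a
# KAN-style expert (function learning). The "kan_like" expert uses learnable
# sine bases per input feature as a stand-in for KAN splines.
class MLPKANBlock(nn.Module):
    def __init__(self, dim=32, freqs=8):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
        self.freq = nn.Parameter(torch.randn(dim, freqs))   # per-feature frequencies
        self.mix = nn.Linear(dim * freqs, dim)              # combine basis responses
        self.gate = nn.Linear(dim, 2)                       # router over two experts

    def kan_like(self, x):
        basis = torch.sin(x.unsqueeze(-1) * self.freq)      # (batch, dim, freqs)
        return self.mix(basis.flatten(-2))

    def forward(self, x):
        w = torch.softmax(self.gate(x), dim=-1)             # per-sample expert weights
        return w[..., :1] * self.mlp(x) + w[..., 1:] * self.kan_like(x)

block = MLPKANBlock()
y = block(torch.randn(4, 32))
```
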
arXiv:2410.01495 [pdf, other] cs.HC (Human-Computer Interaction)
Open-vocabulary Multimodal Emotion Recognition: Dataset, Metric, and Benchmark
Authors: Zheng Lian, Haiyang Sun, Licai Sun, Lan Chen, Haoyu Chen, Hao Gu, Zhuofan Wen, Shun Chen, Siyuan Zhang, Hailiang Yao, Mingyu Xu, Kang Chen, Bin Liu, Rui Liu, Shan Liang, Ya Li, Jiangyan Yi, Jianhua Tao
Abstract: Multimodal Emotion Recognition (MER) is an important research topic. This paper advocates for a transformative paradigm in MER. The rationale behind our work is that current approaches often rely on a limited set of basic emotion labels, which do not adequately represent the rich spectrum of human emotions. These traditional and overly simplistic emotion categories fail to capture the inherent complexity and subtlety of human emotional experiences, leading to limited generalizability and practicality. Therefore, we propose a new MER paradigm called Open-vocabulary MER (OV-MER), which encompasses a broader range of emotion labels to reflect the richness of human emotions. This paradigm relaxes the label space, allowing for the prediction of arbitrary numbers and categories of emotions. To support this transition, we provide a comprehensive solution that includes a newly constructed database based on collaborative LLM and human annotations, along with corresponding metrics and a series of benchmarks. We hope this work advances emotion recognition from basic emotions to more nuanced emotions, contributing to the development of emotional AI.
Submitted 2 October, 2024; originally announced October 2024.

mathjax" id="2410.01495v1-abstract-short" style="display: inline;"> Multimodal Emotion Recognition (MER) is an important research topic. This paper advocates for a transformative paradigm in MER. The rationale behind our work is that current approaches often rely on a limited set of basic emotion labels, which do not adequately represent the rich spectrum of human emotions. These traditional and overly simplistic emotion categories fail to capture the inherent com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01495v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01495v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01495v1-abstract-full" style="display: none;"> Multimodal Emotion Recognition (MER) is an important research topic. This paper advocates for a transformative paradigm in MER. The rationale behind our work is that current approaches often rely on a limited set of basic emotion labels, which do not adequately represent the rich spectrum of human emotions. These traditional and overly simplistic emotion categories fail to capture the inherent complexity and subtlety of human emotional experiences, leading to limited generalizability and practicality. Therefore, we propose a new MER paradigm called Open-vocabulary MER (OV-MER), which encompasses a broader range of emotion labels to reflect the richness of human emotions. This paradigm relaxes the label space, allowing for the prediction of arbitrary numbers and categories of emotions. To support this transition, we provide a comprehensive solution that includes a newly constructed database based on LLM and human collaborative annotations, along with corresponding metrics and a series of benchmarks. We hope this work advances emotion recognition from basic emotions to more nuanced emotions, contributing to the development of emotional AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01495v1-abstract-full').style.display = 'none'; document.getElementById('2410.01495v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
arXiv:2410.00878 [pdf, other] cs.LG (Machine Learning); cs.CR (Cryptography and Security); math.NA (Numerical Analysis)
Empirical Perturbation Analysis of Linear System Solvers from a Data Poisoning Perspective
Authors: Yixin Liu, Arielle Carr, Lichao Sun
Abstract: The perturbation analysis of linear solvers applied to systems arising broadly in machine learning settings -- for instance, when using linear regression models -- establishes an important perspective when reframing these analyses through the lens of a data poisoning attack. By analyzing solvers' responses to such attacks, this work aims to contribute to the development of more robust linear solvers and provide insights into poisoning attacks on linear solvers. In particular, we investigate how errors in the input data affect the fitting error and accuracy of the solution from a linear system-solving algorithm under perturbations common in adversarial attacks. We propose data perturbation through two distinct knowledge levels, developing a poisoning optimization and studying two methods of perturbation: Label-guided Perturbation (LP) and Unconditioning Perturbation (UP). Existing works mainly focus on deriving the worst-case perturbation bound from a theoretical perspective, and the analysis is often limited to specific kinds of linear system solvers. Under the circumstance that the data is intentionally perturbed -- as is the case with data poisoning -- we seek to understand how different kinds of solvers react to these perturbations, identifying the algorithms most impacted by different types of adversarial attacks.
Submitted 1 October, 2024; originally announced October 2024.
Comments: 18 pages

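The empirical setup lends itself to a compact experiment: perturb the right-hand side of a least-squares system under a fixed budget and compare how the solution moves. The two directions below (label-flipping vs. random noise) are simplified stand-ins for the paper's LP and UP methods, not their actual poisoning optimization.

```python
import numpy as np

# Sketch: compare a label-aware perturbation against an unconditioned (random)
# one at the same budget, measuring the shift in the least-squares solution.
rng = np.random.default_rng(0)
A = rng.normal(size=(100, 5))
b = A @ rng.normal(size=5) + 0.01 * rng.normal(size=100)

def solve(A, b):
    return np.linalg.lstsq(A, b, rcond=None)[0]

x_clean = solve(A, b)
budget = 0.5
lp = -budget * b / np.linalg.norm(b)     # push labels toward flipped targets
up = rng.normal(size=100)
up = budget * up / np.linalg.norm(up)    # same budget, no label knowledge

for name, delta in [("LP-like", lp), ("UP-like", up)]:
    shift = np.linalg.norm(solve(A, b + delta) - x_clean)
    print(name, "solution shift:", round(shift, 4))
```
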
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00878v1-abstract-full').style.display = 'none'; document.getElementById('2410.00878v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Sun%2C+L&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+L&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+L&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+L&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+L&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Sun%2C+L&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> 
<a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
