Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 234 results for author: <span class="mathjax">Qi, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Qi%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Qi, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Qi%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Qi, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Qi%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Qi%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Qi%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Qi%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Qi%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Qi%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13817">arXiv:2411.13817</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13817">pdf</a>, <a href="https://arxiv.org/format/2411.13817">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Structural Clustering Unleashed: Flexible Similarities, Versatile Updates and for All Parameters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhuowei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+J">Junhao Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+B">Boyu Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Z">Zhifeng Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianzhong Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Sibo Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13817v1-abstract-short" style="display: inline;"> We study structural clustering on graphs in dynamic scenarios, where the graphs can be updated by arbitrary insertions or deletions of edges/vertices. The goal is to efficiently compute structural clustering results for any clustering parameters $蔚$ and $渭$ given on the fly, for arbitrary graph update patterns, and for all typical similarity measurements. 
Specifically, we adopt the idea of update affordability and propose a much simpler yet more efficient algorithm (both theoretically and practically) than the state of the art, named VD-STAR, to handle graph updates. First, with a theoretical guarantee on clustering result quality, VD-STAR can output high-quality clustering results with up to 99.9% accuracy. Second, VD-STAR is easy to implement, as it only needs to maintain certain sorted linked lists and hash tables, which eases its deployment in practice. Third, and most importantly, by careful analysis, VD-STAR improves the per-update time bound of the state of the art from $O(\log^2 n)$ expected time under certain update-pattern assumptions to $O(\log n)$ amortized expected time without any update-pattern assumption. We further design two variants of VD-STAR to enhance its empirical performance. Experimental results show that our algorithms consistently outperform the state-of-the-art competitors by up to 9,315 times in update time across nine real datasets.
Submitted 20 November, 2024; originally announced November 2024.
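
As context for the clustering parameters $\varepsilon$ and $\mu$ above: structural clustering in the SCAN family scores each pair of adjacent vertices by a structural similarity and declares a vertex a core when at least $\mu$ vertices in its closed neighborhood are $\varepsilon$-similar to it. The sketch below illustrates only this static similarity test (a minimal Python illustration with made-up names; it is not the authors' VD-STAR algorithm, whose contribution is maintaining such results efficiently under updates).

```python
from math import sqrt

def structural_similarity(adj, u, v):
    """Cosine-style structural similarity: |N[u] ∩ N[v]| / sqrt(|N[u]| * |N[v]|),
    where N[x] is the closed neighborhood of x (x together with its neighbors)."""
    nu = adj[u] | {u}
    nv = adj[v] | {v}
    return len(nu & nv) / sqrt(len(nu) * len(nv))

def core_vertices(adj, eps, mu):
    """A vertex is a core if at least mu vertices in its closed neighborhood
    have structural similarity >= eps to it."""
    cores = set()
    for u, neighbors in adj.items():
        similar = sum(1 for v in neighbors | {u}
                      if structural_similarity(adj, u, v) >= eps)
        if similar >= mu:
            cores.add(u)
    return cores

# Toy graph: a triangle (0, 1, 2) plus a pendant vertex 3 attached to 2.
adj = {0: {1, 2}, 1: {0, 2}, 2: {0, 1, 3}, 3: {2}}
print(core_vertices(adj, eps=0.7, mu=3))  # {0, 1, 2}
```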
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12556">arXiv:2411.12556</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12556">pdf</a>, <a href="https://arxiv.org/format/2411.12556">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UMGAD: Unsupervised Multiplex Graph Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianpeng Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhongying Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+G">Guanjie Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+L">Lei Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Junyu Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yanwei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12556v1-abstract-short" style="display: inline;"> Graph anomaly detection (GAD) is a critical task in graph machine learning, with the primary objective of identifying anomalous nodes that deviate significantly from the majority. This task is widely applied in various real-world scenarios, including fraud detection and social network analysis. However, existing GAD methods still face two major challenges: (1) They are often limited to detecting a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12556v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12556v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12556v1-abstract-full" style="display: none;"> Graph anomaly detection (GAD) is a critical task in graph machine learning, with the primary objective of identifying anomalous nodes that deviate significantly from the majority. This task is widely applied in various real-world scenarios, including fraud detection and social network analysis. However, existing GAD methods still face two major challenges: (1) They are often limited to detecting anomalies in single-type interaction graphs and struggle with multiple interaction types in multiplex heterogeneous graphs; (2) In unsupervised scenarios, selecting appropriate anomaly score thresholds remains a significant challenge for accurate anomaly detection. To address the above challenges, we propose a novel Unsupervised Multiplex Graph Anomaly Detection method, named UMGAD. We first learn multi-relational correlations among nodes in multiplex heterogeneous graphs and capture anomaly information during node attribute and structure reconstruction through graph-masked autoencoder (GMAE). Then, to further weaken the influence of noise and redundant information on abnormal information extraction, we generate attribute-level and subgraph-level augmented-view graphs respectively, and perform attribute and structure reconstruction through GMAE. 
Finally, we learn to optimize node attributes and structural features through contrastive learning between original-view and augmented-view graphs to improve the model's ability to capture anomalies. Meanwhile, we also propose a new anomaly score threshold selection strategy, which allows the model to be independent of the ground truth in real unsupervised scenarios. Extensive experiments on four datasets show that UMGAD significantly outperforms state-of-the-art methods, achieving average improvements of 13.48% in AUC and 11.68% in Macro-F1 across all datasets.
Submitted 19 November, 2024; originally announced November 2024.

3. arXiv:2411.09403 [pdf, other]
Subjects: quant-ph (Quantum Physics); cs.AI (Artificial Intelligence)
Title: Quantum Machine Learning: An Interplay Between Quantum Computing and Machine Learning
Authors: Jun Qi, Chao-Han Yang, Samuel Yen-Chi Chen, Pin-Yu Chen
Abstract: Quantum machine learning (QML) is a rapidly growing field that combines quantum computing principles with traditional machine learning. It seeks to revolutionize machine learning by harnessing the unique capabilities of quantum mechanics, and it employs machine learning techniques to advance quantum computing research.
This paper introduces quantum computing for the machine learning paradigm, where variational quantum circuits (VQC) are used to develop QML architectures on noisy intermediate-scale quantum (NISQ) devices. We discuss machine learning for the quantum computing paradigm, showcasing our recent theoretical and empirical findings. In particular, we delve into future directions for studying QML, exploring the potential industrial impacts of QML research.
Submitted 14 November, 2024; originally announced November 2024.
Comments: In submission

4. arXiv:2411.09389 [pdf, other]
Subjects: cs.SI (Social and Information Networks); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CY (Computers and Society); cs.LG (Machine Learning)
Title: Less is More: Unseen Domain Fake News Detection via Causal Propagation Substructures
Authors: Shuzhi Gong, Richard O. Sinnott, Jianzhong Qi, Cecile Paris
Abstract: The spread of fake news on social media poses significant threats to individuals and society. Text-based and graph-based models have been employed for fake news detection by analysing news content and propagation networks, showing promising results in specific scenarios.
However, these data-driven models heavily rely on pre-existing in-distribution data for training, limiting their performance when confronted with fake news from emerging or previously unseen domains, known as out-of-distribution (OOD) data. Tackling OOD fake news is a challenging yet critical task. In this paper, we introduce the Causal Subgraph-oriented Domain Adaptive Fake News Detection (CSDA) model, designed to enhance zero-shot fake news detection by extracting causal substructures from propagation graphs using in-distribution data and generalising this approach to OOD data. The model employs a graph neural network-based mask generation process to identify dominant nodes and edges within the propagation graph, and uses these substructures for fake news detection. Additionally, the performance of CSDA is further improved through contrastive learning in few-shot scenarios, where a limited amount of OOD data is available for training. Extensive experiments on public social media datasets demonstrate that CSDA effectively handles OOD fake news detection, achieving a 7 to 16 percent accuracy improvement over other state-of-the-art models.
Submitted 14 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08552">arXiv:2411.08552</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08552">pdf</a>, <a href="https://arxiv.org/format/2411.08552">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with Variational Quantum Circuits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jun Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chao-Han Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S+Y">Samuel Yen-Chi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+P">Pin-Yu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zenil%2C+H">Hector Zenil</a>, <a href="/search/cs?searchtype=author&amp;query=Tegner%2C+J">Jesper Tegner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08552v1-abstract-short" style="display: inline;"> Quantum Machine Learning (QML) offers tremendous potential but is currently limited by the availability of qubits. We introduce an innovative approach that utilizes pre-trained neural networks to enhance Variational Quantum Circuits (VQC). This technique effectively separates approximation error from qubit count and removes the need for restrictive conditions, making QML more viable for real-world&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08552v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08552v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08552v1-abstract-full" style="display: none;"> Quantum Machine Learning (QML) offers tremendous potential but is currently limited by the availability of qubits. We introduce an innovative approach that utilizes pre-trained neural networks to enhance Variational Quantum Circuits (VQC). This technique effectively separates approximation error from qubit count and removes the need for restrictive conditions, making QML more viable for real-world applications. Our method significantly improves parameter optimization for VQC while delivering notable gains in representation and generalization capabilities, as evidenced by rigorous theoretical analysis and extensive empirical testing on quantum dot classification tasks. Moreover, our results extend to applications such as human genome analysis, demonstrating the broad applicability of our approach. 
By addressing the constraints of current quantum hardware, our work paves the way for a new era of advanced QML applications, unlocking the full potential of quantum computing in fields such as machine learning, materials science, medicine, mimetics, and various interdisciplinary areas.
Submitted 13 November, 2024; originally announced November 2024.
Comments: In submission

6. arXiv:2411.07773 [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: Likelihood as a Performance Gauge for Retrieval-Augmented Generation
Authors: Tianyu Liu, Jirui Qi, Paul He, Arianna Bisazza, Mrinmaya Sachan, Ryan Cotterell
Abstract: Recent work finds that retrieval-augmented generation with large language models is prone to being influenced by the order of retrieved documents in the context. However, the lack of in-depth analysis limits the use of this phenomenon for prompt engineering in practice.
In this study, we posit that likelihoods serve as an effective gauge for language model performance. Through experiments on two question-answering datasets with a variety of state-of-the-art language models, we reveal correlations between answer accuracy and the likelihood of the question at both the corpus level and the instance level. In addition, we find that question likelihood can also indicate the position of the task-relevant information in the context. Based on these findings, we propose two methods that use question likelihood as a gauge for selecting and constructing prompts that lead to better performance, and we demonstrate their effectiveness with experiments. Moreover, our likelihood-based methods are efficient, as they only need to compute the likelihood of the input, requiring far fewer language model passes than heuristic prompt engineering methods that require generating responses. Our analysis deepens our understanding of how input prompts affect model performance and provides a promising direction for efficient prompt optimization.
Submitted 12 November, 2024; originally announced November 2024.
Comments: Under review at NAACL 2025. Code is available at https://github.com/lyutyuh/poptimizer
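
The gauge itself is cheap to compute: score each candidate prompt (for example, each ordering of the retrieved documents) by the log-likelihood the language model assigns to the question, and keep the highest-scoring prompt. The snippet below is a minimal sketch of that scoring step with a small causal LM; the model choice (gpt2), prompt format, and variable names are illustrative assumptions rather than the released poptimizer code.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def question_loglik(context: str, question: str) -> float:
    """Sum of token log-probabilities of `question` given `context` under a causal LM."""
    ctx_ids = tok(context, return_tensors="pt").input_ids
    q_ids = tok(question, return_tensors="pt").input_ids
    ids = torch.cat([ctx_ids, q_ids], dim=1)
    with torch.no_grad():
        logprobs = model(ids).logits.log_softmax(dim=-1)
    # The logits at position p-1 predict the token at position p.
    q_positions = range(ctx_ids.shape[1], ids.shape[1])
    return sum(logprobs[0, p - 1, ids[0, p]].item() for p in q_positions)

docs = ["Passage A ...", "Passage B ..."]      # retrieved documents (placeholders)
question = "Question: which passage mentions A?"
orderings = [docs, docs[::-1]]                 # candidate prompts differ only in document order
best = max(orderings, key=lambda o: question_loglik("\n".join(o) + "\n", question))
```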

7. arXiv:2411.07395 [pdf]
Subjects: cs.AI (Artificial Intelligence)
Title: Data-Centric Learning Framework for Real-Time Detection of Aiming Beam in Fluorescence Lifetime Imaging Guided Surgery
Authors: Mohamed Abul Hassan, Pu Sun, Xiangnan Zhou, Lisanne Kraft, Kelsey T Hadfield, Katjana Ehrlich, Jinyi Qi, Andrew Birkeland, Laura Marcu
Abstract: This study introduces a novel data-centric approach to improve real-time surgical guidance using fiber-based fluorescence lifetime imaging (FLIm). A key aspect of the methodology is the accurate detection of the aiming beam, which is essential for localizing points used to map FLIm measurements onto the tissue region within the surgical field. The primary challenge arises from the complex and variable conditions encountered in the surgical environment, particularly in Transoral Robotic Surgery (TORS). Uneven illumination in the surgical field can cause reflections, reduce contrast, and result in inconsistent color representation, further complicating aiming beam detection. To overcome these challenges, an instance segmentation model was developed using a data-centric training strategy that improves accuracy by minimizing label noise and enhancing detection robustness. The model was evaluated on a dataset comprising 40 in vivo surgical videos, demonstrating a median detection rate of 85%. This performance was maintained when the model was integrated into a clinical system, achieving a similar detection rate of 85% during TORS procedures conducted in patients. The system's computational efficiency, measured at approximately 24 frames per second (FPS), was sufficient for real-time surgical guidance. This study enhances the reliability of FLIm-based aiming beam detection in complex surgical environments, advancing the feasibility of real-time, image-guided interventions for improved surgical precision.
Submitted 11 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24018">arXiv:2410.24018</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.24018">pdf</a>, <a href="https://arxiv.org/format/2410.24018">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bayesian-guided Label Mapping for Visual Reprogramming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+C">Chengyi Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Z">Zesheng Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+L">Lei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianzhong Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Feng Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24018v1-abstract-short" style="display: inline;"> Visual reprogramming (VR) leverages the intrinsic capabilities of pretrained vision models by adapting their input or output interfaces to solve downstream tasks whose labels (i.e., downstream labels) might be totally different from the labels associated with the pretrained models (i.e., pretrained labels). When adapting the output interface, label mapping methods transform the pretrained labels t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24018v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24018v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24018v1-abstract-full" style="display: none;"> Visual reprogramming (VR) leverages the intrinsic capabilities of pretrained vision models by adapting their input or output interfaces to solve downstream tasks whose labels (i.e., downstream labels) might be totally different from the labels associated with the pretrained models (i.e., pretrained labels). When adapting the output interface, label mapping methods transform the pretrained labels to downstream labels by establishing a gradient-free one-to-one correspondence between the two sets of labels. However, in this paper, we reveal that one-to-one mappings may overlook the complex relationship between pretrained and downstream labels. Motivated by this observation, we propose a Bayesian-guided Label Mapping (BLM) method. BLM constructs an iteratively-updated probabilistic label mapping matrix, with each element quantifying a pairwise relationship between pretrained and downstream labels. The assignment of values to the constructed matrix is guided by Bayesian conditional probability, considering the joint distribution of the downstream labels and the labels predicted by the pretrained model on downstream samples. Experiments conducted on both pretrained vision models (e.g., ResNeXt) and vision-language models (e.g., CLIP) demonstrate the superior performance of BLM over existing label mapping methods. The success of BLM also offers a probabilistic lens through which to understand and analyze the effectiveness of VR. 
Our code is available at https://github.com/tmlr-group/BayesianLM.
Submitted 31 October, 2024; originally announced October 2024.
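
To make the mapping idea above concrete, a probabilistic label mapping matrix can be estimated from how the frozen pretrained model's predictions co-occur with downstream ground-truth labels. The following is a rough numpy sketch of that estimation and of applying the matrix to a pretrained-label distribution; it omits BLM's iterative updates, and the function names, smoothing, and toy data are illustrative assumptions, not the released BayesianLM code.

```python
import numpy as np

def probabilistic_label_mapping(pretrained_preds, downstream_labels, n_pre, n_down, alpha=1.0):
    """Estimate M[i, j] ~ P(downstream label j | pretrained label i) from downstream samples,
    using joint counts of (predicted pretrained label, true downstream label) with
    additive (Laplace) smoothing alpha."""
    counts = np.full((n_pre, n_down), alpha)
    for i, j in zip(pretrained_preds, downstream_labels):
        counts[i, j] += 1.0
    return counts / counts.sum(axis=1, keepdims=True)

def map_to_downstream(pretrained_probs, M):
    """Convert a distribution over pretrained labels into one over downstream labels."""
    return pretrained_probs @ M

# Toy usage: 3 pretrained labels, 2 downstream labels.
preds  = [0, 0, 1, 2, 2, 2]   # argmax predictions of the frozen pretrained model
labels = [0, 0, 0, 1, 1, 0]   # ground-truth downstream labels for the same samples
M = probabilistic_label_mapping(preds, labels, n_pre=3, n_down=2)
print(map_to_downstream(np.array([0.2, 0.1, 0.7]), M))  # downstream distribution for one sample
```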

9. arXiv:2410.18610 [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: A Joint Representation Using Continuous and Discrete Features for Cardiovascular Diseases Risk Prediction on Chest CT Scans
Authors: Minfeng Xu, Chen-Chen Fan, Yan-Jie Zhou, Wenchao Guo, Pan Liu, Jing Qi, Le Lu, Hanqing Chao, Kunlun He
Abstract: Cardiovascular diseases (CVD) remain a leading health concern and contribute significantly to global mortality rates. While clinical advancements have led to a decline in CVD mortality, accurately identifying individuals who could benefit from preventive interventions remains an unsolved challenge in preventive cardiology. Current CVD risk prediction models, recommended by guidelines, are based on limited traditional risk factors or use CT imaging to acquire quantitative biomarkers, and they still have limitations in predictive accuracy and applicability. On the other hand, end-to-end trained CVD risk prediction methods leveraging deep learning on CT images often fail to provide transparent and explainable decision grounds for assisting physicians. In this work, we propose a novel joint representation that integrates discrete quantitative biomarkers and continuous deep features extracted from chest CT scans. Our approach starts with a deep CVD risk classification model that captures comprehensive continuous deep learning features while jointly obtaining clinically established quantitative biomarkers via segmentation models. In the feature joint representation stage, we use an instance-wise feature-gated mechanism to align the continuous and discrete features, followed by a soft instance-wise feature interaction mechanism fostering independent and effective feature interaction for the final CVD risk prediction. Our method substantially improves CVD risk predictive performance and offers individual contribution analysis of each biomarker, which is important in assisting physicians' decision-making processes. We validated our method on a public chest low-dose CT dataset and a private external chest standard-dose CT patient cohort of 17,207 CT volumes from 6,393 unique subjects, and demonstrated superior predictive performance, achieving AUCs of 0.875 and 0.843, respectively.
Submitted 15 November, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14881">arXiv:2410.14881</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14881">pdf</a>, <a href="https://arxiv.org/format/2410.14881">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Class-RAG: Content Moderation with Retrieval Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jianfa Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+E">Emily Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Bavalatti%2C+T">Trupti Bavalatti</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xiaowen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yongkai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shuming Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Subramanyam%2C+H">Harihar Subramanyam</a>, <a href="/search/cs?searchtype=author&amp;query=Vepuri%2C+K+S">Ksheeraj Sai Vepuri</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+M">Ming Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Ji Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Li Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+N">Nan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+A">Ankit Jain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14881v1-abstract-short" style="display: inline;"> Robust content moderation classifiers are essential for the safety of Generative AI systems. Content moderation, or safety classification, is notoriously ambiguous: differences between safe and unsafe inputs are often extremely subtle, making it difficult for classifiers (and indeed, even humans) to properly distinguish violating vs. benign samples without further context or explanation. Furthermo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14881v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14881v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14881v1-abstract-full" style="display: none;"> Robust content moderation classifiers are essential for the safety of Generative AI systems. Content moderation, or safety classification, is notoriously ambiguous: differences between safe and unsafe inputs are often extremely subtle, making it difficult for classifiers (and indeed, even humans) to properly distinguish violating vs. benign samples without further context or explanation. Furthermore, as these technologies are deployed across various applications and audiences, scaling risk discovery and mitigation through continuous model fine-tuning becomes increasingly challenging and costly. 

10. arXiv:2410.14881 [pdf, other]
Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: Class-RAG: Content Moderation with Retrieval Augmented Generation
Authors: Jianfa Chen, Emily Shen, Trupti Bavalatti, Xiaowen Lin, Yongkai Wang, Shuming Hu, Harihar Subramanyam, Ksheeraj Sai Vepuri, Ming Jiang, Ji Qi, Li Chen, Nan Jiang, Ankit Jain
Abstract: Robust content moderation classifiers are essential for the safety of Generative AI systems. Content moderation, or safety classification, is notoriously ambiguous: differences between safe and unsafe inputs are often extremely subtle, making it difficult for classifiers (and indeed, even humans) to properly distinguish violating vs. benign samples without further context or explanation. Furthermore, as these technologies are deployed across various applications and audiences, scaling risk discovery and mitigation through continuous model fine-tuning becomes increasingly challenging and costly. To address these challenges, we propose a Classification approach employing Retrieval-Augmented Generation (Class-RAG). Class-RAG extends the capability of its base LLM through access to a retrieval library that can be dynamically updated to enable semantic hotfixing for immediate, flexible risk mitigation. Compared to traditional fine-tuned models, Class-RAG demonstrates flexibility and transparency in decision-making. As evidenced by empirical studies, Class-RAG outperforms on classification and is more robust against adversarial attacks. Moreover, our findings suggest that Class-RAG performance scales with retrieval library size, indicating that increasing the library size is a viable and low-cost approach to improve content moderation.
Submitted 18 October, 2024; originally announced October 2024.
Comments: 11 pages, submitted to ACL
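
Retrieval-augmented classification of this kind typically embeds the input, retrieves the nearest labeled exemplars from an updatable library, and lets the base LLM decide with those exemplars in the prompt; adding a new exemplar is the "semantic hotfix". The following is a schematic Python sketch of that loop; the embedding function, prompt format, and library entries are placeholders rather than the paper's pipeline.

```python
import numpy as np

class RetrievalLibrary:
    """A small, dynamically updatable library of labeled exemplars."""
    def __init__(self, embed):
        self.embed, self.texts, self.labels, self.vecs = embed, [], [], []

    def add(self, text, label):                    # a "semantic hotfix" is just an add()
        v = self.embed(text)
        self.texts.append(text)
        self.labels.append(label)
        self.vecs.append(v / np.linalg.norm(v))

    def retrieve(self, query, k=2):                # nearest exemplars by cosine similarity
        q = self.embed(query)
        sims = np.array(self.vecs) @ (q / np.linalg.norm(q))
        return [(self.texts[i], self.labels[i]) for i in np.argsort(-sims)[:k]]

def build_prompt(query, exemplars):
    shots = "\n".join(f"Input: {t}\nLabel: {l}" for t, l in exemplars)
    return f"{shots}\nInput: {query}\nLabel:"      # the base LLM completes the final label

def toy_embed(text, dim=64):                       # placeholder; a real system uses a trained encoder
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    return rng.normal(size=dim)

lib = RetrievalLibrary(toy_embed)
lib.add("instructions for building a weapon", "unsafe")
lib.add("instructions for baking sourdough bread", "safe")
print(build_prompt("how do I bake a baguette?", lib.retrieve("how do I bake a baguette?")))
```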

11. arXiv:2410.12846 [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Accurate and Regret-aware Numerical Problem Solver for Tabular Question Answering
Authors: Yuxiang Wang, Jianzhong Qi, Junhao Gan
Abstract: Question answering on free-form tables (a.k.a. TableQA) is a challenging task because of the flexible structure and complex schema of tables. Recent studies use Large Language Models (LLMs) for this task, exploiting their capability in understanding the questions and the tabular data, which are typically given in natural language and contain many textual fields, respectively. While this approach has shown promising results, it overlooks the challenges brought by numerical values, which are common in tabular data and which LLMs are known to struggle with. We aim to address this issue and answer numerical questions. We propose a model named TabLaP that uses LLMs as a planner rather than an answer generator, exploiting LLMs' capability in multi-step reasoning while leaving the actual numerical calculations to a Python interpreter for accurate calculation. Recognizing the inaccurate nature of LLMs, we further make a first attempt to quantify the trustworthiness of the answers produced by TabLaP, so that users can use TabLaP in a regret-aware manner. Experimental results on two benchmark datasets show that TabLaP is substantially more accurate than the state-of-the-art models, improving answer accuracy by 5.7% and 5.8% on the two datasets, respectively.
Submitted 10 October, 2024; originally announced October 2024.
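
The planner/executor split described above boils down to: the LLM proposes which arithmetic to perform over the relevant cells, and a Python interpreter performs it. A minimal sketch follows; the call_llm stub, the expression-only plan format, and the toy table are illustrative assumptions, not TabLaP's actual pipeline.

```python
import ast
import operator

OPS = {ast.Add: operator.add, ast.Sub: operator.sub,
       ast.Mult: operator.mul, ast.Div: operator.truediv}

def safe_eval(expr: str) -> float:
    """Evaluate a plain arithmetic expression (numbers and + - * / only)."""
    def ev(node):
        if isinstance(node, ast.Expression):
            return ev(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in OPS:
            return OPS[type(node.op)](ev(node.left), ev(node.right))
        raise ValueError("unsupported expression")
    return ev(ast.parse(expr, mode="eval"))

def call_llm(prompt: str) -> str:
    # Stub for any chat-completion API: the LLM reads the table and question and
    # returns an arithmetic expression over the relevant cells, e.g. "(120 + 95) / 2".
    return "(120 + 95) / 2"

table = {"2022 revenue": 120, "2023 revenue": 95}
question = "What was the average revenue over 2022 and 2023?"
plan = call_llm(f"Table: {table}\nQuestion: {question}\nReturn one arithmetic expression.")
print(safe_eval(plan))  # 107.5 -- computed by the interpreter, not generated by the LLM
```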
While retrieval augmentation methods offer an efficient way to integrate external knowledge, extending them to the vision-language dom&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08876v2-abstract-full').style.display = 'inline'; document.getElementById('2410.08876v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08876v2-abstract-full" style="display: none;"> Current vision-language models (VLMs) still exhibit inferior performance on knowledge-intensive tasks, primarily due to the challenge of accurately encoding all the associations of visual objects and scenes with their corresponding entities and background knowledge. While retrieval augmentation methods offer an efficient way to integrate external knowledge, extending them to the vision-language domain presents unique challenges in (1) precisely retrieving relevant information from external sources due to the inherent discrepancy within the multimodal queries, and (2) being resilient to the irrelevant, extraneous and noisy information contained in the retrieved multimodal knowledge snippets. In this work, we introduce RORA-VLM, a novel and robust retrieval augmentation framework specifically tailored for VLMs, with two key innovations: (1) a 2-stage retrieval process with image-anchored textual-query expansion to synergistically combine the visual and textual information in the query and retrieve the most relevant multimodal knowledge snippets; and (2) a robust retrieval augmentation method that strengthens the resilience of VLMs against irrelevant information in the retrieved multimodal knowledge by injecting adversarial noise into the retrieval-augmented training process, and filters out extraneous visual information, such as unrelated entities present in images, via a query-oriented visual token refinement strategy. We conduct extensive experiments to validate the effectiveness and robustness of our proposed methods on three widely adopted benchmark datasets. Our results demonstrate that with a minimal number of training instances, RORA-VLM enables the base model to achieve significant performance improvements and consistently outperform state-of-the-art retrieval-augmented VLMs on all benchmarks while also exhibiting a novel zero-shot domain transfer capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08876v2-abstract-full').style.display = 'none'; document.getElementById('2410.08876v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08249">arXiv:2410.08249</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08249">pdf</a>, <a href="https://arxiv.org/format/2410.08249">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Federated Graph Learning for Cross-Domain Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Ziqi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhaopeng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zihui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianzhong Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chaochao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+W">Weike Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+C">Chenglu Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiaoliang Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08249v2-abstract-short" style="display: inline;"> Cross-domain recommendation (CDR) offers a promising solution to the data sparsity problem by enabling knowledge transfer across source and target domains. However, many recent CDR models overlook crucial issues such as privacy as well as the risk of negative transfer (which negatively impact model performance), especially in multi-domain settings. To address these challenges, we propose FedGCDR,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08249v2-abstract-full').style.display = 'inline'; document.getElementById('2410.08249v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08249v2-abstract-full" style="display: none;"> Cross-domain recommendation (CDR) offers a promising solution to the data sparsity problem by enabling knowledge transfer across source and target domains. However, many recent CDR models overlook crucial issues such as privacy as well as the risk of negative transfer (which negatively impact model performance), especially in multi-domain settings. To address these challenges, we propose FedGCDR, a novel federated graph learning framework that securely and effectively leverages positive knowledge from multiple source domains. First, we design a positive knowledge transfer module that ensures privacy during inter-domain knowledge transmission. This module employs differential privacy-based knowledge extraction combined with a feature mapping mechanism, transforming source domain embeddings from federated graph attention networks into reliable domain knowledge. Second, we design a knowledge activation module to filter out potential harmful or conflicting knowledge from source domains, addressing the issues of negative transfer. 
This module enhances target domain training by expanding the graph of the target domain to generate reliable domain attentions and fine-tunes the target model for improved negative knowledge filtering and more accurate predictions. We conduct extensive experiments on 16 popular domains of the Amazon dataset, demonstrating that FedGCDR significantly outperforms state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08249v2-abstract-full').style.display = 'none'; document.getElementById('2410.08249v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS&#39;24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08048">arXiv:2410.08048</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08048">pdf</a>, <a href="https://arxiv.org/format/2410.08048">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VerifierQ: Enhancing LLM Test Time Compute with Q-Learning-based Verifiers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianing Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+H">Hao Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhigang Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08048v1-abstract-short" style="display: inline;"> Recent advancements in test time compute, particularly through the use of verifier models, have significantly enhanced the reasoning capabilities of Large Language Models (LLMs). This generator-verifier approach closely resembles the actor-critic framework in reinforcement learning (RL). However, current verifier models in LLMs often rely on supervised fine-tuning without temporal difference learn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08048v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08048v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08048v1-abstract-full" style="display: none;"> Recent advancements in test time compute, particularly through the use of verifier models, have significantly enhanced the reasoning capabilities of Large Language Models (LLMs). This generator-verifier approach closely resembles the actor-critic framework in reinforcement learning (RL). However, current verifier models in LLMs often rely on supervised fine-tuning without temporal difference learning such as Q-learning. 
This paper introduces VerifierQ, a novel approach that integrates Offline Q-learning into LLM verifier models. We address three key challenges in applying Q-learning to LLMs: (1) handling utterance-level Markov Decision Processes (MDPs), (2) managing large action spaces, and (3) mitigating overestimation bias. VerifierQ introduces a modified Bellman update for bounded Q-values, incorporates Implicit Q-learning (IQL) for efficient action space management, and integrates a novel Conservative Q-learning (CQL) formulation for balanced Q-value estimation. Our method enables parallel Q-value computation and improves training efficiency. While recent work has explored RL techniques like MCTS for generators, VerifierQ is among the first to investigate the verifier (critic) aspect in LLMs through Q-learning. This integration of RL principles into verifier models complements existing advancements in generator techniques, potentially enabling more robust and adaptive reasoning in LLMs. Experimental results on mathematical reasoning tasks demonstrate VerifierQ&#39;s superior performance compared to traditional supervised fine-tuning approaches, with improvements in efficiency, accuracy and robustness. By enhancing the synergy between generation and evaluation capabilities, VerifierQ contributes to the ongoing evolution of AI systems in addressing complex cognitive tasks across various domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08048v1-abstract-full').style.display = 'none'; document.getElementById('2410.08048v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06115">arXiv:2410.06115</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06115">pdf</a>, <a href="https://arxiv.org/format/2410.06115">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A physics-based perspective for understanding and utilizing spatial resources of wireless channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hui Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J+W">Jun Wei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Z+J">Zhen Jie Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H+T">Hao Tian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+R+W">Rui Wen Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Q">Qiang Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jieao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+L">Linglong Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+T+J">Tie Jun Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06115v1-abstract-short" style="display: inline;"> To satisfy the increasing demands for transmission rates of wireless communications, it is necessary to use spatial resources of electromagnetic (EM) waves. In this context, EM information theory (EIT) has become a hot topic by integrating the theoretical framework of deterministic mathematics and stochastic statistics to explore the transmission mechanisms of continuous EM waves. However, the pre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06115v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06115v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06115v1-abstract-full" style="display: none;"> To satisfy the increasing demands for transmission rates of wireless communications, it is necessary to use spatial resources of electromagnetic (EM) waves. In this context, EM information theory (EIT) has become a hot topic by integrating the theoretical framework of deterministic mathematics and stochastic statistics to explore the transmission mechanisms of continuous EM waves. However, the previous studies were primarily focused on frame analysis, with limited exploration of practical applications and a comprehensive understanding of its essential physical characteristics. In this paper, we present a three-dimensional (3-D) line-of-sight channel capacity formula that captures the vector EM physics and accommodates both near- and far-field scenes. Based on the rigorous mathematical equation and the physical mechanism of fast multipole expansion, a channel model is established, and the finite angular spectral bandwidth feature of scattered waves is revealed. 
To adapt to this feature of the channel, an optimization problem is formulated for determining the mode currents on the transmitter, aiming to obtain the optimal design of the precoder and combiner. We make comprehensive analyses to investigate the relationship among the spatial degree of freedom, noise, and transmitted power, thereby establishing a rigorous upper bound of channel capacity. A series of simulations are conducted to validate the theoretical model and numerical method. This work offers a novel perspective and methodology for understanding and leveraging EIT, and provides a theoretical foundation for the design and optimization of future wireless communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06115v1-abstract-full').style.display = 'none'; document.getElementById('2410.06115v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05731">arXiv:2410.05731</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05731">pdf</a>, <a href="https://arxiv.org/format/2410.05731">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Enhancing SPARQL Generation by Triplet-order-sensitive Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Su%2C+C">Chang Su</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jiexing Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">He Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+K">Kai Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhouhan Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05731v1-abstract-short" style="display: inline;"> Semantic parsing that translates natural language queries to SPARQL is of great importance for Knowledge Graph Question Answering (KGQA) systems. Although pre-trained language models like T5 have achieved significant success in the Text-to-SPARQL task, their generated outputs still exhibit notable errors specific to the SPARQL language, such as triplet flips. To address this challenge and further&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05731v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05731v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05731v1-abstract-full" style="display: none;"> Semantic parsing that translates natural language queries to SPARQL is of great importance for Knowledge Graph Question Answering (KGQA) systems.
Although pre-trained language models like T5 have achieved significant success in the Text-to-SPARQL task, their generated outputs still exhibit notable errors specific to the SPARQL language, such as triplet flips. To address this challenge and further improve the performance, we propose an additional pre-training stage with a new objective, Triplet Order Correction (TOC), along with the commonly used Masked Language Modeling (MLM), to collectively enhance the model&#39;s sensitivity to triplet order and SPARQL syntax. Our method achieves state-of-the-art performances on three widely-used benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05731v1-abstract-full').style.display = 'none'; document.getElementById('2410.05731v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by CIKM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05240">arXiv:2409.05240</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05240">pdf</a>, <a href="https://arxiv.org/format/2409.05240">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> </div> <p class="title is-5 mathjax"> A Physics-Enforced Neural Network to Predict Polymer Melt Viscosity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jain%2C+A">Ayush Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Gurnani%2C+R">Rishi Gurnani</a>, <a href="/search/cs?searchtype=author&amp;query=Rajan%2C+A">Arunkumar Rajan</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+H+J">H. Jerry Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Ramprasad%2C+R">Rampi Ramprasad</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05240v1-abstract-short" style="display: inline;"> Achieving superior polymeric components through additive manufacturing (AM) relies on precise control of rheology. One key rheological property particularly relevant to AM is melt viscosity ($\eta$). Melt viscosity is influenced by polymer chemistry, molecular weight ($M_w$), polydispersity, induced shear rate ($\dot\gamma$), and processing temperature ($T$). The relationship of $\eta$ with $M_w$, $\dot\gamma$, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05240v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05240v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05240v1-abstract-full" style="display: none;"> Achieving superior polymeric components through additive manufacturing (AM) relies on precise control of rheology.
One key rheological property particularly relevant to AM is melt viscosity ($\eta$). Melt viscosity is influenced by polymer chemistry, molecular weight ($M_w$), polydispersity, induced shear rate ($\dot\gamma$), and processing temperature ($T$). The relationship of $\eta$ with $M_w$, $\dot\gamma$, and $T$ may be captured by parameterized equations. Several physical experiments are required to fit the parameters, so predicting $\eta$ of a new polymer material in unexplored physical domains is a laborious process. Here, we develop a Physics-Enforced Neural Network (PENN) model that predicts the empirical parameters and encodes the parametrized equations to calculate $\eta$ as a function of polymer chemistry, $M_w$, polydispersity, $\dot\gamma$, and $T$. We benchmark our PENN against physics-unaware Artificial Neural Network (ANN) and Gaussian Process Regression (GPR) models. Finally, we demonstrate that the PENN offers superior values of $\eta$ when extrapolating to unseen values of $M_w$, $\dot\gamma$, and $T$ for sparsely seen polymers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05240v1-abstract-full').style.display = 'none'; document.getElementById('2409.05240v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02828">arXiv:2409.02828</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02828">pdf</a>, <a href="https://arxiv.org/format/2409.02828">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> ExpLLM: Towards Chain of Thought for Facial Expression Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xing Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+J">Jian Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Ji Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Ke Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chua%2C+T">Tat-Seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02828v1-abstract-short" style="display: inline;"> Facial expression recognition (FER) is a critical task in multimedia with significant implications across various domains. However, analyzing the causes of facial expressions is essential for accurately recognizing them.
Current approaches, such as those based on facial action units (AUs), typically provide AU names and intensities but lack insight into the interactions and relationships between A&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02828v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02828v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02828v1-abstract-full" style="display: none;"> Facial expression recognition (FER) is a critical task in multimedia with significant implications across various domains. However, analyzing the causes of facial expressions is essential for accurately recognizing them. Current approaches, such as those based on facial action units (AUs), typically provide AU names and intensities but lack insight into the interactions and relationships between AUs and the overall expression. In this paper, we propose a novel method called ExpLLM, which leverages large language models to generate an accurate chain of thought (CoT) for facial expression recognition. Specifically, we have designed the CoT mechanism from three key perspectives: key observations, overall emotional interpretation, and conclusion. The key observations describe the AU&#39;s name, intensity, and associated emotions. The overall emotional interpretation provides an analysis based on multiple AUs and their interactions, identifying the dominant emotions and their relationships. Finally, the conclusion presents the final expression label derived from the preceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed to construct this expression CoT and generate instruction-description data for training our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets demonstrate that ExpLLM outperforms current state-of-the-art FER methods. ExpLLM also surpasses the latest GPT-4o in expression CoT generation, particularly in recognizing micro-expressions where GPT-4o frequently fails. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02828v1-abstract-full').style.display = 'none'; document.getElementById('2409.02828v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://starhiking.github.io/ExpLLM_Page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16500">arXiv:2408.16500</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.16500">pdf</a>, <a href="https://arxiv.org/format/2408.16500">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CogVLM2: Visual Language Models for Image and Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hong%2C+W">Wenyi Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weihan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+M">Ming Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wenmeng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+Q">Qingsong Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yean Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Shiyu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+J">Junhui Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+Z">Zhao Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhuoyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+X">Xiaotao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaohan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+G">Guanyu Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Da Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Ji Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xixuan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Peng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Debing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Bin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Juanzi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuxiao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jie Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16500v1-abstract-short" style="display: inline;"> Beginning with VisualGLM and CogVLM, we are continuously exploring VLMs in pursuit of enhanced vision-language fusion, efficient higher-resolution architecture, and broader modalities and applications. Here we propose the CogVLM2 family, a new generation of visual language models for image and video understanding including CogVLM2, CogVLM2-Video and GLM-4V. 
As an image understanding model, CogVLM2&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16500v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16500v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16500v1-abstract-full" style="display: none;"> Beginning with VisualGLM and CogVLM, we are continuously exploring VLMs in pursuit of enhanced vision-language fusion, efficient higher-resolution architecture, and broader modalities and applications. Here we propose the CogVLM2 family, a new generation of visual language models for image and video understanding including CogVLM2, CogVLM2-Video and GLM-4V. As an image understanding model, CogVLM2 inherits the visual expert architecture with improved training recipes in both pre-training and post-training stages, supporting input resolution up to $1344 \times 1344$ pixels. As a video understanding model, CogVLM2-Video integrates multi-frame input with timestamps and proposes automated temporal grounding data construction. Notably, the CogVLM2 family has achieved state-of-the-art results on benchmarks like MMBench, MM-Vet, TextVQA, MVBench and VCGBench. All models are open-sourced at https://github.com/THUDM/CogVLM2 and https://github.com/THUDM/GLM-4, contributing to the advancement of the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16500v1-abstract-full').style.display = 'none'; document.getElementById('2408.16500v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12003">arXiv:2408.12003</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12003">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RAG-Optimized Tibetan Tourism LLMs: Enhancing Accuracy and Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jinhu Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shuai Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yibo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+R">Rong Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yuwei Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Ke Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12003v1-abstract-short" style="display: inline;"> With the development of the modern social economy, tourism has become an important way to meet people&#39;s spiritual needs, bringing development opportunities to the tourism industry. However, existing large language models (LLMs) face challenges in personalized recommendation capabilities and the generation of content that can sometimes produce hallucinations.
This study proposes an optimization sch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12003v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12003v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12003v1-abstract-full" style="display: none;"> With the development of the modern social economy, tourism has become an important way to meet people&#39;s spiritual needs, bringing development opportunities to the tourism industry. However, existing large language models (LLMs) face challenges in personalized recommendation capabilities and the generation of content that can sometimes produce hallucinations. This study proposes an optimization scheme for Tibet tourism LLMs based on retrieval-augmented generation (RAG) technology. By constructing a database of tourist viewpoints and processing the data using vectorization techniques, we have significantly improved retrieval accuracy. The application of RAG technology effectively addresses the hallucination problem in content generation. The optimized model shows significant improvements in fluency, accuracy, and relevance of content generation. This research demonstrates the potential of RAG technology in the standardization of cultural tourism information and data analysis, providing theoretical and technical support for the development of intelligent cultural tourism service systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12003v1-abstract-full').style.display = 'none'; document.getElementById('2408.12003v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AIPR 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06134">arXiv:2408.06134</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06134">pdf</a>, <a href="https://arxiv.org/format/2408.06134">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Learned Indexes with Distribution Smoothing via Virtual Points </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Amarasinghe%2C+K">Kasun Amarasinghe</a>, <a href="/search/cs?searchtype=author&amp;query=Choudhury%2C+F">Farhana Choudhury</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianzhong Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Bailey%2C+J">James Bailey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06134v2-abstract-short" style="display: inline;"> Recent research on learned indexes has created a new perspective for indexes as models that map keys to their respective storage locations. These learned indexes are created to approximate the cumulative distribution function of the key set, where using only a single model may have limited accuracy. To overcome this limitation, a typical method is to use multiple models, arranged in a hierarchical&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06134v2-abstract-full').style.display = 'inline'; document.getElementById('2408.06134v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06134v2-abstract-full" style="display: none;"> Recent research on learned indexes has created a new perspective for indexes as models that map keys to their respective storage locations. These learned indexes are created to approximate the cumulative distribution function of the key set, where using only a single model may have limited accuracy. To overcome this limitation, a typical method is to use multiple models, arranged in a hierarchical manner, where the query performance depends on two aspects: (i) traversal time to find the correct model and (ii) search time to find the key in the selected model. Such a method may cause some key space regions that are difficult to model to be placed at deeper levels in the hierarchy. To address this issue, we propose an alternative method that modifies the key space as opposed to any structural or model modifications. This is achieved through making the key set more learnable (i.e., smoothing the distribution) by inserting virtual points. Furthermore, we develop an algorithm named CSV to integrate our virtual point insertion method into existing learned indexes, reducing both their traversal and search time. We implement CSV on state-of-the-art learned indexes and evaluate them on real-world datasets. 
Extensive experimental results show significant query performance improvement for the keys in deeper levels of the index structures at a low storage cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06134v2-abstract-full').style.display = 'none'; document.getElementById('2408.06134v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16716">arXiv:2407.16716</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16716">pdf</a>, <a href="https://arxiv.org/ps/2407.16716">ps</a>, <a href="https://arxiv.org/format/2407.16716">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring The Neural Burden In Pruned Models: An Insight Inspired By Neuroscience </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zeyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+W">Weichen Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xiangyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Ji Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16716v2-abstract-short" style="display: inline;"> Vision Transformer and its variants have been adopted in many visual tasks due to their powerful capabilities, which also bring significant challenges in computation and storage. Consequently, researchers have introduced various compression methods in recent years, among which the pruning techniques are widely used to remove a significant fraction of the network. Therefore, these methods can reduc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16716v2-abstract-full').style.display = 'inline'; document.getElementById('2407.16716v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16716v2-abstract-full" style="display: none;"> Vision Transformer and its variants have been adopted in many visual tasks due to their powerful capabilities, which also bring significant challenges in computation and storage. Consequently, researchers have introduced various compression methods in recent years, among which the pruning techniques are widely used to remove a significant fraction of the network. Therefore, these methods can reduce a significant percentage of the FLOPs, but often lead to a decrease in model performance.
To investigate the underlying causes, we focus on the pruning methods specifically belonging to the pruning-during-training category, draw inspiration from neuroscience, and propose a new concept for artificial neural network models named Neural Burden. We investigate its impact on the model pruning process, and subsequently explore a simple yet effective approach to mitigate the decline in model performance, which can be applied to any pruning-during-training technique. Extensive experiments indicate that the neural burden phenomenon indeed exists, and show the potential of our method. We hope that our findings can provide valuable insights for future research. Code will be made publicly available after this paper is published. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16716v2-abstract-full').style.display = 'none'; document.getElementById('2407.16716v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13561">arXiv:2407.13561</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13561">pdf</a>, <a href="https://arxiv.org/format/2407.13561">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Research on Tibetan Tourism Viewpoints information generation system based on LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jinhu Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shuai Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yibo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zirui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Ke Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13561v1-abstract-short" style="display: inline;"> Tibet, ensconced within China&#39;s territorial expanse, is distinguished by its labyrinthine and heterogeneous topography, a testament to its profound historical heritage, and the cradle of a unique religious ethos.
The very essence of these attributes, however, has impeded the advancement of Tibet&#39;s tourism service infrastructure, rendering existing smart tourism services inadequate for the region&#39;s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13561v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13561v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13561v1-abstract-full" style="display: none;"> Tibet, ensconced within China&#39;s territorial expanse, is distinguished by its labyrinthine and heterogeneous topography, a testament to its profound historical heritage, and the cradle of a unique religious ethos. The very essence of these attributes, however, has impeded the advancement of Tibet&#39;s tourism service infrastructure, rendering existing smart tourism services inadequate for the region&#39;s visitors. This study delves into the ramifications of informational disparities at tourist sites on Tibetan tourism and addresses the challenge of establishing the Large Language Model (LLM) evaluation criteria. It introduces an innovative approach, the DualGen Bridge AI system, employing supervised fine-tuning techniques to bolster model functionality and enhance optimization processes. Furthermore, it pioneers a multi-structured generative results assessment framework. Empirical validation confirms the efficacy of this framework. The study also explores the application of the supervised fine-tuning method within the proprietary DualGen Bridge AI, aimed at refining the generation of tourist site information. The study&#39;s findings offer valuable insights for optimizing system performance and provide support and inspiration for the application of LLM technology in Tibet&#39;s tourism services and beyond, potentially revolutionizing the smart tourism industry with advanced, tailored information generation capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13561v1-abstract-full').style.display = 'none'; document.getElementById('2407.13561v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICWOC 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00056">arXiv:2407.00056</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00056">pdf</a>, <a href="https://arxiv.org/format/2407.00056">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> MMBee: Live Streaming Gift-Sending Recommendations via Multi-Modal Fusion and Behaviour Expansion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Deng%2C+J">Jiaxin Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuchen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jiansong Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Liqin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+G">Guorui Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+G">Gaofeng Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00056v1-abstract-short" style="display: inline;"> Live streaming services are becoming increasingly popular due to real-time interactions and entertainment. Viewers can chat and send comments or virtual gifts to express their preferences for the streamers. Accurately modeling the gifting interaction not only enhances users&#39; experience but also increases streamers&#39; revenue. Previous studies on live streaming gifting prediction treat this task as a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00056v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00056v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00056v1-abstract-full" style="display: none;"> Live streaming services are becoming increasingly popular due to real-time interactions and entertainment. Viewers can chat and send comments or virtual gifts to express their preferences for the streamers. Accurately modeling the gifting interaction not only enhances users&#39; experience but also increases streamers&#39; revenue. Previous studies on live streaming gifting prediction treat this task as a conventional recommendation problem, and model users&#39; preferences using categorical data and observed historical behaviors. However, it is challenging to precisely describe the real-time content changes in live streaming using limited categorical information. Moreover, due to the sparsity of gifting behaviors, capturing the preferences and intentions of users is quite difficult. In this work, we propose MMBee based on real-time Multi-Modal Fusion and Behaviour Expansion to address these issues. 
Specifically, we first present a Multi-modal Fusion Module with Learnable Query (MFQ) to perceive the dynamic content of streaming segments and process complex multi-modal interactions, including images, text comments and speech. To alleviate the sparsity issue of gifting behaviors, we present a novel Graph-guided Interest Expansion (GIE) approach that learns both user and streamer representations on large-scale gifting graphs with multi-modal attributes. Comprehensive experiment results show that MMBee achieves significant performance improvements on both public datasets and Kuaishou real-world streaming datasets and the effectiveness has been further validated through online A/B experiments. MMBee has been deployed and is serving hundreds of millions of users at Kuaishou. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00056v1-abstract-full').style.display = 'none'; document.getElementById('2407.00056v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at KDD 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19999">arXiv:2406.19999</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19999">pdf</a>, <a href="https://arxiv.org/format/2406.19999">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The SIFo Benchmark: Investigating the Sequential Instruction Following Ability of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+B">Baohao Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jirui Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Eustratiadis%2C+P">Panagiotis Eustratiadis</a>, <a href="/search/cs?searchtype=author&amp;query=Monz%2C+C">Christof Monz</a>, <a href="/search/cs?searchtype=author&amp;query=Bisazza%2C+A">Arianna Bisazza</a>, <a href="/search/cs?searchtype=author&amp;query=de+Rijke%2C+M">Maarten de Rijke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19999v2-abstract-short" style="display: inline;"> Following multiple instructions is a crucial ability for large language models (LLMs). Evaluating this ability comes with significant challenges: (i) limited coherence between multiple instructions, (ii) positional bias where the order of instructions affects model performance, and (iii) a lack of objectively verifiable tasks. 
To address these issues, we introduce a benchmark designed to evaluate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19999v2-abstract-full').style.display = 'inline'; document.getElementById('2406.19999v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19999v2-abstract-full" style="display: none;"> Following multiple instructions is a crucial ability for large language models (LLMs). Evaluating this ability comes with significant challenges: (i) limited coherence between multiple instructions, (ii) positional bias where the order of instructions affects model performance, and (iii) a lack of objectively verifiable tasks. To address these issues, we introduce a benchmark designed to evaluate models&#39; abilities to follow multiple instructions through sequential instruction following (SIFo) tasks. In SIFo, the successful completion of multiple instructions is verifiable by examining only the final instruction. Our benchmark evaluates instruction following using four tasks (text modification, question answering, mathematics, and security rules), each assessing different aspects of sequential instruction following. Our evaluation of popular LLMs, both closed-source and open-source, shows that more recent and larger models significantly outperform their older and smaller counterparts on the SIFo tasks, validating the benchmark&#39;s effectiveness. All models struggle with following sequences of instructions, hinting at an important lack of robustness of today&#39;s language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19999v2-abstract-full').style.display = 'none'; document.getElementById('2406.19999v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14709">arXiv:2406.14709</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14709">pdf</a>, <a href="https://arxiv.org/format/2406.14709">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Factual Dialogue Summarization via Learning from Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+R">Rongxin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Lau%2C+J+H">Jey Han Lau</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianzhong Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14709v1-abstract-short" style="display: inline;"> Factual consistency is an important quality in dialogue summarization. 
arXiv:2406.14709 [pdf, other] cs.CL
Factual Dialogue Summarization via Learning from Large Language Models
Authors: Rongxin Zhu, Jey Han Lau, Jianzhong Qi
Abstract: Factual consistency is an important quality in dialogue summarization. Large language model (LLM)-based automatic text summarization models generate more factually consistent summaries compared to those by smaller pretrained language models, but they face deployment challenges in real-world applications due to privacy or resource constraints. In this paper, we investigate the use of symbolic knowledge distillation to improve the factual consistency of smaller pretrained models for dialogue summarization. We employ zero-shot learning to extract symbolic knowledge from LLMs, generating both factually consistent (positive) and inconsistent (negative) summaries. We then apply two contrastive learning objectives on these summaries to enhance smaller summarization models. Experiments with BART, PEGASUS, and Flan-T5 indicate that our approach surpasses strong baselines that rely on complex data augmentation strategies. Our approach achieves better factual consistency while maintaining coherence, fluency, and relevance, as confirmed by various automatic evaluation metrics. We also provide access to the data and code to facilitate future research.
Submitted 20 June, 2024; originally announced June 2024.
ACM Class: F.2.2; I.2.7
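To make the "contrastive learning objectives on LLM-generated positive and negative summaries" idea concrete, here is a minimal margin-based contrastive loss over sequence log-likelihoods. The loss form, the margin value, and the dummy scores standing in for a BART/PEGASUS/Flan-T5 scorer are all assumptions; the paper's two specific objectives are not reproduced here.

```python
import torch

def contrastive_margin_loss(pos_loglik: torch.Tensor,
                            neg_loglik: torch.Tensor,
                            margin: float = 1.0) -> torch.Tensor:
    """Push the summarizer to score factually consistent (positive) summaries higher
    than inconsistent (negative) ones by at least `margin`.
    pos_loglik, neg_loglik: per-example sequence log-likelihoods, shape (batch,)."""
    return torch.clamp(margin - (pos_loglik - neg_loglik), min=0.0).mean()

# Toy usage with dummy log-likelihoods in place of a real summarization model.
pos = torch.tensor([-10.2, -8.7], requires_grad=True)
neg = torch.tensor([-9.8, -12.1], requires_grad=True)
loss = contrastive_margin_loss(pos, neg)
loss.backward()
print(float(loss))
```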
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13663">arXiv:2406.13663</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13663">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Model Internals-based Answer Attribution for Trustworthy Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jirui Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Sarti%2C+G">Gabriele Sarti</a>, <a href="/search/cs?searchtype=author&amp;query=Fern%C3%A1ndez%2C+R">Raquel Fern谩ndez</a>, <a href="/search/cs?searchtype=author&amp;query=Bisazza%2C+A">Arianna Bisazza</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13663v4-abstract-short" style="display: inline;"> Ensuring the verifiability of model answers is a fundamental challenge for retrieval-augmented generation (RAG) in the question answering (QA) domain. Recently, self-citation prompting was proposed to make large language models (LLMs) generate citations to supporting documents along with their answers. However, self-citing LLMs often struggle to match the required format, refer to non-existent sou&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13663v4-abstract-full').style.display = 'inline'; document.getElementById('2406.13663v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13663v4-abstract-full" style="display: none;"> Ensuring the verifiability of model answers is a fundamental challenge for retrieval-augmented generation (RAG) in the question answering (QA) domain. Recently, self-citation prompting was proposed to make large language models (LLMs) generate citations to supporting documents along with their answers. However, self-citing LLMs often struggle to match the required format, refer to non-existent sources, and fail to faithfully reflect LLMs&#39; context usage throughout the generation. In this work, we present MIRAGE --Model Internals-based RAG Explanations -- a plug-and-play approach using model internals for faithful answer attribution in RAG applications. MIRAGE detects context-sensitive answer tokens and pairs them with retrieved documents contributing to their prediction via saliency methods. We evaluate our proposed approach on a multilingual extractive QA dataset, finding high agreement with human answer attribution. On open-ended QA, MIRAGE achieves citation quality and efficiency comparable to self-citation while also allowing for a finer-grained control of attribution parameters. Our qualitative evaluation highlights the faithfulness of MIRAGE&#39;s attributions and underscores the promising application of model internals for RAG answer attribution. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13663v4-abstract-full').style.display = 'none'; document.getElementById('2406.13663v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Main Conference. Code and data released at https://github.com/Betswish/MIRAGE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08035">arXiv:2406.08035</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.08035">pdf</a>, <a href="https://arxiv.org/format/2406.08035">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LVBench: An Extreme Long Video Understanding Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weihan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zehai He</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+W">Wenyi Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yean Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaohan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Ji Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+X">Xiaotao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Shiyu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Bin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuxiao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+M">Ming Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jie Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08035v2-abstract-short" style="display: inline;"> Recent progress in multimodal large language models has markedly enhanced the understanding of short videos (typically under one minute), and several evaluation datasets have emerged accordingly. 
arXiv:2406.08035 [pdf, other] cs.CV cs.AI
LVBench: An Extreme Long Video Understanding Benchmark
Authors: Weihan Wang, Zehai He, Wenyi Hong, Yean Cheng, Xiaohan Zhang, Ji Qi, Xiaotao Gu, Shiyu Huang, Bin Xu, Yuxiao Dong, Ming Ding, Jie Tang
Abstract: Recent progress in multimodal large language models has markedly enhanced the understanding of short videos (typically under one minute), and several evaluation datasets have emerged accordingly. However, these advancements fall short of meeting the demands of real-world applications such as embodied intelligence for long-term decision-making, in-depth movie reviews and discussions, and live sports commentary, all of which require comprehension of long videos spanning several hours. To address this gap, we introduce LVBench, a benchmark specifically designed for long video understanding. Our dataset comprises publicly sourced videos and encompasses a diverse set of tasks aimed at long video comprehension and information extraction. LVBench is designed to challenge multimodal models to demonstrate long-term memory and extended comprehension capabilities. Our extensive evaluations reveal that current multimodal models still underperform on these demanding long video understanding tasks. Through LVBench, we aim to spur the development of more advanced models capable of tackling the complexities of long video comprehension. Our data and code are publicly available at: https://lvbench.github.io.
Submitted 23 October, 2024; v1 submitted 12 June, 2024; originally announced June 2024.

arXiv:2406.07925 [pdf, other] cs.DC
FDLoRA: Personalized Federated Learning of Large Language Model via Dual LoRA Tuning
Authors: Jiaxing QI, Zhongzhi Luan, Shaohan Huang, Carol Fung, Hailong Yang, Depei Qian
Abstract: Large language models (LLMs) have emerged as important components across various fields, yet their training requires substantial computation resources and abundant labeled data. It poses a challenge to robustly training LLMs for individual users (clients). To tackle this challenge, the intuitive idea is to introduce federated learning (FL), which can collaboratively train models on distributed private data. However, existing methods suffer from the challenges of data heterogeneity, system heterogeneity, and model size, resulting in suboptimal performance and high costs. In this work, we proposed a variant of personalized federated learning (PFL) framework, namely FDLoRA, which allows the client to be a single device or a cluster and adopts low-rank adaptation (LoRA) tuning. FDLoRA sets dual LoRA modules on each client to capture personalized and global knowledge, respectively, and only the global LoRA module uploads parameters to the central server to aggregate cross-client knowledge. Finally, an adaptive fusion approach is employed to combine the parameters of the dual LoRAs. This enables FDLoRA to make effective use of private data distributed across different clients, thereby improving performance on the client without incurring high communication and computing costs. We conducted extensive experiments in two practice scenarios. The results demonstrate that FDLoRA outperforms six baselines in terms of performance, stability, robustness, computation cost, and communication cost.
Submitted 12 June, 2024; originally announced June 2024.
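The client/server split described above (dual LoRA modules per client, with only the global module aggregated across clients and the two later fused) can be sketched in a few lines. The FedAvg-style averaging and the fixed mixing weight below are simplifying assumptions; the paper's adaptive fusion is not reproduced here.

```python
import numpy as np

def make_lora(d: int = 16, r: int = 4, seed: int = 0) -> dict:
    """One LoRA module as a pair of low-rank factors (B @ A is the weight update)."""
    rng = np.random.default_rng(seed)
    return {"A": rng.normal(size=(r, d)), "B": np.zeros((d, r))}

# Each client keeps a personalized LoRA (never shared) and a global LoRA (shared).
clients = [{"personal": make_lora(seed=i), "global": make_lora(seed=100 + i)} for i in range(3)]

# Server round: only the global LoRA parameters are uploaded and averaged (FedAvg-style).
aggregated = {k: np.mean([c["global"][k] for c in clients], axis=0) for k in ("A", "B")}
for c in clients:
    c["global"] = {k: aggregated[k].copy() for k in aggregated}

# Client-side fusion: combine the dual LoRAs; a fixed weight stands in for adaptive fusion.
alpha = 0.5
def fused_delta(client: dict) -> np.ndarray:
    delta_personal = client["personal"]["B"] @ client["personal"]["A"]
    delta_global = client["global"]["B"] @ client["global"]["A"]
    return alpha * delta_personal + (1 - alpha) * delta_global  # added to the frozen base weight

print(fused_delta(clients[0]).shape)  # (16, 16)
```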
arXiv:2406.03150 [pdf, other] cs.LG cs.CV
Sample-specific Masks for Visual Reprogramming-based Prompting
Authors: Chengyi Cai, Zesheng Ye, Lei Feng, Jianzhong Qi, Feng Liu
Abstract: Visual reprogramming (VR) is a prompting technique that aims to re-purpose a pre-trained model (e.g., a classifier on ImageNet) to target tasks (e.g., medical data prediction) by learning a small-scale pattern added into input images instead of tuning considerable parameters within the model. The location of the pattern within input samples is usually determined by a pre-defined mask shared across all samples. In this paper, we show that the shared mask potentially limits VR's generalization and increases its approximation error due to the lack of sample-level adaptation. Motivated by this finding, we design a new framework for VR called sample-specific multi-channel masks (SMM). Specifically, SMM employs a lightweight ConvNet and patch-wise interpolation to generate sample-specific three-channel masks instead of a shared and pre-defined mask. Since we generate different masks for individual samples, SMM is theoretically shown to reduce approximation error for the target tasks compared with existing state-of-the-art VR methods. We also empirically demonstrate its performance gain on both ResNet and ViT. The success of SMM further highlights the broader applicability of VR in leveraging the latent knowledge of pre-trained models for various target tasks. Our code is available at https://github.com/tmlr-group/SMM.
Submitted 5 June, 2024; originally announced June 2024.
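A rough PyTorch sketch of the mask-generation idea follows: a small ConvNet predicts a low-resolution three-channel mask per input, which is upsampled and used to gate a shared learnable pattern before it is added to the image. The layer sizes, the sigmoid gating, and the nearest-neighbour upsampling (standing in for patch-wise interpolation) are assumptions, not the released SMM code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SampleSpecificMask(nn.Module):
    def __init__(self, img_size: int = 224):
        super().__init__()
        self.pattern = nn.Parameter(torch.zeros(1, 3, img_size, img_size))  # shared VR pattern
        self.masker = nn.Sequential(                                        # lightweight ConvNet
            nn.Conv2d(3, 8, 3, stride=4, padding=1), nn.ReLU(),
            nn.Conv2d(8, 3, 3, stride=4, padding=1),
        )
        self.img_size = img_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        low_res = self.masker(x)                                            # (B, 3, 14, 14) for 224 input
        mask = torch.sigmoid(F.interpolate(low_res, size=self.img_size, mode="nearest"))
        return x + mask * self.pattern                                      # per-sample pattern placement

x = torch.randn(2, 3, 224, 224)
print(SampleSpecificMask()(x).shape)  # feed this output into the frozen pre-trained classifier
```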
arXiv:2405.03989 [pdf] cs.DB
A Method for Parsing and Vectorization of Semi-structured Data used in Retrieval Augmented Generation
Authors: Hang Yang, Jing Guo, Jianchuan Qi, Jinliang Xie, Si Zhang, Siqi Yang, Nan Li, Ming Xu
Abstract: This paper presents a novel method for parsing and vectorizing semi-structured data to enhance the functionality of Retrieval-Augmented Generation (RAG) within Large Language Models (LLMs). We developed a comprehensive pipeline for converting various data formats into .docx, enabling efficient parsing and structured data extraction. The core of our methodology involves the construction of a vector database using Pinecone, which integrates seamlessly with LLMs to provide accurate, context-specific responses, particularly in environmental management and wastewater treatment operations. Through rigorous testing with both English and Chinese texts in diverse document formats, our results demonstrate a marked improvement in the precision and reliability of LLMs outputs. The RAG-enhanced models displayed enhanced ability to generate contextually rich and technically accurate responses, underscoring the potential of vector knowledge bases in significantly boosting the performance of LLMs in specialized domains. This research not only illustrates the effectiveness of our method but also highlights its potential to revolutionize data processing and analysis in environmental sciences, setting a precedent for future advancements in AI-driven applications. Our code is available at https://github.com/linancn/TianGong-AI-Unstructure.git.
Submitted 8 May, 2024; v1 submitted 7 May, 2024; originally announced May 2024.
Comments: 20 pages, 4 figures, 5 tables
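The retrieval half of such a pipeline (embed parsed chunks, store them in a vector index, and pull the top matches into the prompt) can be sketched without any external service. The paper uses Pinecone; to keep this sketch self-contained it uses an in-memory index with random embeddings, and the chunk texts are invented placeholders.

```python
import numpy as np

# In-memory stand-in for a vector database; a real pipeline would embed the parsed .docx
# chunks with a text-embedding model and upsert them into a managed index.
rng = np.random.default_rng(0)
chunks = ["influent COD limit section", "sludge retention time table", "aeration tank design notes"]
vectors = rng.normal(size=(len(chunks), 384))
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

def retrieve(query_vec: np.ndarray, top_k: int = 2) -> list[str]:
    query_vec = query_vec / np.linalg.norm(query_vec)
    scores = vectors @ query_vec                    # cosine similarity against all chunks
    return [chunks[i] for i in np.argsort(-scores)[:top_k]]

context = retrieve(rng.normal(size=384))
prompt = "Answer using only this context:\n" + "\n".join(context) + "\n\nQuestion: ..."
print(prompt)
```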
arXiv:2404.05091 [pdf, other] cs.CL
MM-MATH: Advancing Multimodal Math Evaluation with Process Evaluation and Fine-grained Classification
Authors: Kai Sun, Yushi Bai, Ji Qi, Lei Hou, Juanzi Li
Abstract: To advance the evaluation of multimodal math reasoning in large multimodal models (LMMs), this paper introduces a novel benchmark, MM-MATH. MM-MATH consists of 5,929 open-ended middle school math problems with visual contexts, with fine-grained classification across difficulty, grade level, and knowledge points. Unlike existing benchmarks relying on binary answer comparison, MM-MATH incorporates both outcome and process evaluations. Process evaluation employs LMM-as-a-judge to automatically analyze solution steps, identifying and categorizing errors into specific error types. Extensive evaluation of ten models on MM-MATH reveals significant challenges for existing LMMs, highlighting their limited utilization of visual information and struggles with higher-difficulty problems. The best-performing model achieves only 31% accuracy on MM-MATH, compared to 82% for humans. This highlights the challenging nature of our benchmark for existing models and the significant gap between the multimodal reasoning capabilities of current models and humans. Our process evaluation reveals that diagram misinterpretation is the most common error, accounting for more than half of the total error cases, underscoring the need for improved image comprehension in multimodal reasoning.
Submitted 2 July, 2024; v1 submitted 7 April, 2024; originally announced April 2024.

arXiv:2403.18282 [pdf, other] cs.CV
SGDM: Static-Guided Dynamic Module Make Stronger Visual Models
Authors: Wenjie Xing, Zhenchao Cui, Jing Qi
Abstract: The spatial attention mechanism has been widely used to improve object detection performance. However, its operation is currently limited to static convolutions lacking content-adaptive features. This paper innovatively approaches from the perspective of dynamic convolution. We propose Razor Dynamic Convolution (RDConv) to address the two flaws of dynamic weight convolution that make it hard to implement in a spatial mechanism: 1) it is computation-heavy; 2) when generating weights, spatial information is disregarded. Firstly, by using Razor Operation to generate certain features, we vastly reduce the parameters of the entire dynamic convolution operation. Secondly, we added a spatial branch inside RDConv to generate convolutional kernel parameters with richer spatial information. Embedding dynamic convolution will also bring the problem of sensitivity to high-frequency noise. We propose the Static-Guided Dynamic Module (SGDM) to address this limitation. By using SGDM, we utilize a set of asymmetric static convolution kernel parameters to guide the construction of dynamic convolution. We introduce the mechanism of shared weights in static convolution to solve the problem of dynamic convolution being sensitive to high-frequency noise. Extensive experiments illustrate that multiple different object detection backbones equipped with SGDM achieve a highly competitive boost in performance (e.g., +4% mAP with YOLOv5n on VOC and +1.7% mAP with YOLOv8n on COCO) with negligible parameter increase (i.e., +0.33M on YOLOv5n and +0.19M on YOLOv8n).
Submitted 27 March, 2024; originally announced March 2024.
Comments: 16 pages, 4 figures
arXiv:2403.17448 [pdf, other] cs.RO
Adaptive Line-Of-Sight guidance law based on vector fields path following for underactuated unmanned surface vehicle
Authors: Jie Qi, Ronghua Wanga, Nailong Wu
Abstract: The focus of this paper is to develop a methodology that enables an unmanned surface vehicle (USV) to efficiently track a planned path. The introduction of a vector field-based adaptive line-of-sight guidance law (VFALOS) for accurate trajectory tracking and minimizing the overshoot response time during USV tracking of curved paths improves the overall line-of-sight (LOS) guidance method. These improvements contribute to faster convergence to the desired path, reduce oscillations, and can mitigate the effects of persistent external disturbances. It is shown that the proposed guidance law exhibits k-exponential stability when converging to the desired path consisting of straight and curved lines. The results in the paper show that the proposed method effectively improves the accuracy of the USV tracking the desired path while ensuring the safety of the USV work.
Submitted 5 April, 2024; v1 submitted 26 March, 2024; originally announced March 2024.
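As background for the guidance law discussed above, the classic proportional line-of-sight rule that VFALOS extends can be written in a few lines: steer toward a point a fixed lookahead distance ahead on the path, correcting for the cross-track error. The sign convention and the optional disturbance-correction term are assumptions, and this is the textbook LOS law, not the vector-field adaptive law proposed in the paper.

```python
import math

def los_desired_course(path_angle: float, cross_track_error: float,
                       lookahead: float = 5.0, correction: float = 0.0) -> float:
    """Classic proportional line-of-sight guidance for path following.
    path_angle: tangent angle of the desired path (rad); cross_track_error: signed lateral
    offset from the path (m, positive to port); lookahead: LOS distance (m);
    correction: optional integral-style term for slowly varying disturbances."""
    return path_angle - math.atan2(cross_track_error + correction, lookahead)

# Toy usage: vehicle 2 m to port of a straight path heading due east (path angle 0).
print(math.degrees(los_desired_course(0.0, 2.0)))  # about -21.8 deg, steering back toward the path
```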
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10309v1-abstract-full').style.display = 'none'; document.getElementById('2403.10309v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.02576">arXiv:2403.02576</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.02576">pdf</a>, <a href="https://arxiv.org/format/2403.02576">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> AceMap: Knowledge Discovery through Academic Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinbing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+L">Luoyi Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+X">Xiaoying Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Y">Ying Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+G">Guanjie Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+J">Jiaxin Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+L">Liyao Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+N">Nanyang Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+M">Meng Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+S">Shiyu Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+B">Bin Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haiwen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+C">Cheng Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+H">Huquan Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xingli Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhixin Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jiexing Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yuyang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+L">Lyuwen Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jungang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jianping Zhou</a> , et al. (1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.02576v2-abstract-short" style="display: inline;"> The exponential growth of scientific literature requires effective management and extraction of valuable insights. 
While existing scientific search engines excel at delivering search results based on relational databases, they often neglect the analysis of collaborations between scientific entities and the evolution of ideas, as well as the in-depth analysis of content within scientific publicatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02576v2-abstract-full').style.display = 'inline'; document.getElementById('2403.02576v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.02576v2-abstract-full" style="display: none;"> The exponential growth of scientific literature requires effective management and extraction of valuable insights. While existing scientific search engines excel at delivering search results based on relational databases, they often neglect the analysis of collaborations between scientific entities and the evolution of ideas, as well as the in-depth analysis of content within scientific publications. The representation of heterogeneous graphs and the effective measurement, analysis, and mining of such graphs pose significant challenges. To address these challenges, we present AceMap, an academic system designed for knowledge discovery through academic graph. We present advanced database construction techniques to build the comprehensive AceMap database with large-scale academic entities that contain rich visual, textual, and numerical information. AceMap also employs innovative visualization, quantification, and analysis methods to explore associations and logical relationships among academic entities. AceMap introduces large-scale academic network visualization techniques centered on nebular graphs, providing a comprehensive view of academic networks from multiple perspectives. In addition, AceMap proposes a unified metric based on structural entropy to quantitatively measure the knowledge content of different academic entities. Moreover, AceMap provides advanced analysis capabilities, including tracing the evolution of academic ideas through citation relationships and concept co-occurrence, and generating concise summaries informed by this evolutionary process. In addition, AceMap uses machine reading methods to generate potential new ideas at the intersection of different fields. Exploring the integration of large language models and knowledge graphs is a promising direction for future research in idea evolution. Please visit \url{https://www.acemap.info} for further exploration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02576v2-abstract-full').style.display = 'none'; document.getElementById('2403.02576v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report for AceMap (https://www.acemap.info)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01799">arXiv:2403.01799</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.01799">pdf</a>, <a href="https://arxiv.org/format/2403.01799">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Superpixel Graph Contrastive Clustering with Semantic-Invariant Augmentations for Hyperspectral Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jianhan Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Y">Yuheng Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+J">Junhui Hou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01799v1-abstract-short" style="display: inline;"> Hyperspectral images (HSI) clustering is an important but challenging task. The state-of-the-art (SOTA) methods usually rely on superpixels, however, they do not fully utilize the spatial and spectral information in HSI 3-D structure, and their optimization targets are not clustering-oriented. In this work, we first use 3-D and 2-D hybrid convolutional neural networks to extract the high-order spa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01799v1-abstract-full').style.display = 'inline'; document.getElementById('2403.01799v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01799v1-abstract-full" style="display: none;"> Hyperspectral images (HSI) clustering is an important but challenging task. The state-of-the-art (SOTA) methods usually rely on superpixels, however, they do not fully utilize the spatial and spectral information in HSI 3-D structure, and their optimization targets are not clustering-oriented. In this work, we first use 3-D and 2-D hybrid convolutional neural networks to extract the high-order spatial and spectral features of HSI through pre-training, and then design a superpixel graph contrastive clustering (SPGCC) model to learn discriminative superpixel representations. Reasonable augmented views are crucial for contrastive clustering, and conventional contrastive learning may hurt the cluster structure since different samples are pushed away in the embedding space even if they belong to the same class. In SPGCC, we design two semantic-invariant data augmentations for HSI superpixels: pixel sampling augmentation and model weight augmentation. Then sample-level alignment and clustering-center-level contrast are performed for better intra-class similarity and inter-class dissimilarity of superpixel embeddings. We perform clustering and network optimization alternatively. 
Experimental results on several HSI datasets verify the advantages of the proposed method, e.g., on India Pines, our model improves the clustering accuracy from 58.79% to 67.59% compared to the SOTA method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01799v1-abstract-full').style.display = 'none'; document.getElementById('2403.01799v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.00799">arXiv:2403.00799</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.00799">pdf</a>, <a href="https://arxiv.org/format/2403.00799">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Study of Data Ability Boundary in LLMs&#39; Math Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yezeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jiaqi Han</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhijie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Ji Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.00799v1-abstract-short" style="display: inline;"> Large language models (LLMs) are displaying emergent abilities for math reasoning tasks,and there is a growing attention on enhancing the ability of open-source LLMs through supervised fine-tuning (SFT).In this paper, we aim to explore a general data strategy for supervised data to help optimize and expand math reasoning ability.Firstly, we determine the ability boundary of reasoning paths augment&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.00799v1-abstract-full').style.display = 'inline'; document.getElementById('2403.00799v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.00799v1-abstract-full" style="display: none;"> Large language models (LLMs) are displaying emergent abilities for math reasoning tasks,and there is a growing attention on enhancing the ability of open-source LLMs through supervised fine-tuning (SFT).In this paper, we aim to explore a general data strategy for supervised data to help optimize and expand math reasoning ability.Firstly, we determine the ability boundary of reasoning paths augmentation by identifying these paths&#39; minimal optimal set.Secondly, we validate that different abilities of the model can be cumulatively enhanced by Mix of Minimal Optimal Sets of corresponding types of data, 
arXiv:2403.00799 [pdf, other] cs.CL cs.AI cs.LG
An Empirical Study of Data Ability Boundary in LLMs' Math Reasoning
Authors: Zui Chen, Yezeng Chen, Jiaqi Han, Zhijie Huang, Ji Qi, Yi Zhou
Abstract: Large language models (LLMs) are displaying emergent abilities for math reasoning tasks, and there is a growing attention on enhancing the ability of open-source LLMs through supervised fine-tuning (SFT). In this paper, we aim to explore a general data strategy for supervised data to help optimize and expand math reasoning ability. Firstly, we determine the ability boundary of reasoning paths augmentation by identifying these paths' minimal optimal set. Secondly, we validate that different abilities of the model can be cumulatively enhanced by Mix of Minimal Optimal Sets of corresponding types of data, while our models MMOS achieve SOTA performance on series base models under much lower construction costs. Besides, we point out GSM-HARD is not really hard and today's LLMs no longer lack numerical robustness. Also, we provide an Auto Problem Generator for robustness testing and educational applications. Our code and data are publicly available at https://github.com/cyzhh/MMOS.
Submitted 23 February, 2024; originally announced March 2024.
Comments: 33 pages, 5 figures

arXiv:2402.04798 [pdf, other] cs.CV
Spiking-PhysFormer: Camera-Based Remote Photoplethysmography with Parallel Spike-driven Transformer
Authors: Mingxuan Liu, Jiankai Tang, Yongli Chen, Haoxiang Li, Jiahao Qi, Siwei Li, Kegang Wang, Jie Gan, Yuntao Wang, Hong Chen
Abstract: Artificial neural networks (ANNs) can help camera-based remote photoplethysmography (rPPG) in measuring cardiac activity and physiological signals from facial videos, such as pulse wave, heart rate and respiration rate, with better accuracy. However, most existing ANN-based methods require substantial computing resources, which poses challenges for effective deployment on mobile devices. Spiking neural networks (SNNs), on the other hand, hold immense potential for energy-efficient deep learning owing to their binary and event-driven architecture. To the best of our knowledge, we are the first to introduce SNNs into the realm of rPPG, proposing a hybrid neural network (HNN) model, the Spiking-PhysFormer, aimed at reducing power consumption. Specifically, the proposed Spiking-PhysFormer consists of an ANN-based patch embedding block, SNN-based transformer blocks, and an ANN-based predictor head. First, to simplify the transformer block while preserving its capacity to aggregate local and global spatio-temporal features, we design a parallel spike transformer block to replace sequential sub-blocks. Additionally, we propose a simplified spiking self-attention mechanism that omits the value parameter without compromising the model's performance. Experiments conducted on four datasets (PURE, UBFC-rPPG, UBFC-Phys, and MMPD) demonstrate that the proposed model achieves a 12.4% reduction in power consumption compared to PhysFormer. Additionally, the power consumption of the transformer block is reduced by a factor of 12.2, while maintaining decent performance as PhysFormer and other ANN-based models.
Submitted 2 October, 2024; v1 submitted 7 February, 2024; originally announced February 2024.
Comments: Mingxuan Liu, Jiankai Tang and Yongli Chen are co-first authors of the article
After training, models can solve various visual problems by actively eliciting intrinsic manipulations (e.g., grounding, zooming in) and their results (e.g., boxes, images) without involving external tools, while also allowing users to trace error causes. We study the roadmap to implement this mechanism, including (1) a flexible design of manipulations based on extensive analysis, (2) an efficient automated data generation pipeline, (3) a compatible VLM architecture capable of multi-turn, multi-image interaction, and (4) a model training process for versatile capabilities. With this design, we also manually annotate 6K high-quality samples for challenging graphical mathematical problems. Our trained 17B-parameter model, CogCoM, equipped with this mechanism achieves state-of-the-art performance across 9 benchmarks from 4 categories, demonstrating its effectiveness while preserving interpretability. Our code, model weights, and collected data are publicly available at https://github.com/THUDM/CogCoM.

Submitted 22 May, 2024; v1 submitted 6 February, 2024; originally announced February 2024.
Comments: 19 pages, 9 figures
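As a rough illustration of what a recorded manipulation chain could look like, the sketch below defines a hypothetical trace structure; the field names, manipulation names, and example values are invented for illustration and are not taken from the CogCoM release.

# Hypothetical chain-of-manipulations trace: each step keeps its evidence so a
# final answer can be traced back to the step that produced (or broke) it.
from dataclasses import dataclass
from typing import Any, List

@dataclass
class ManipulationStep:
    name: str        # e.g. "grounding", "zoom_in", "read_text"
    argument: str    # what the manipulation is applied to
    evidence: Any    # returned result, e.g. a bounding box or a crop identifier

@dataclass
class ReasoningChain:
    question: str
    steps: List[ManipulationStep]
    answer: str

chain = ReasoningChain(
    question="What is written on the small sign near the door?",
    steps=[
        ManipulationStep("grounding", "small sign near the door", evidence=(412, 318, 470, 355)),
        ManipulationStep("zoom_in", "box (412, 318, 470, 355)", evidence="crop_0"),
        ManipulationStep("read_text", "crop_0", evidence="STAFF ONLY"),
    ],
    answer="STAFF ONLY",
)
print(chain.answer)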
arXiv:2401.18058 [pdf, other] cs.CL cs.LG
LongAlign: A Recipe for Long Context Alignment of Large Language Models
Authors: Yushi Bai, Xin Lv, Jiajie Zhang, Yuze He, Ji Qi, Lei Hou, Jie Tang, Yuxiao Dong, Juanzi Li
Abstract: Extending large language models to effectively handle long contexts requires instruction fine-tuning on input sequences of similar length. To address this, we present LongAlign, a recipe for the instruction data, training, and evaluation of long context alignment. First, we construct a long instruction-following dataset using Self-Instruct. To ensure data diversity, it covers a broad range of tasks from various long context sources. Second, we adopt packing and sorted batching strategies to speed up supervised fine-tuning on data with varied length distributions. Additionally, we develop a loss weighting method to balance the contribution to the loss across different sequences during packing training. Third, we introduce the LongBench-Chat benchmark for evaluating instruction-following capabilities on queries of 10k-100k in length. Experiments show that LongAlign outperforms existing recipes for LLMs in long context tasks by up to 30%, while also maintaining their proficiency in handling short, generic tasks. The code, data, and long-aligned models are open-sourced at https://github.com/THUDM/LongAlign.

Submitted 31 January, 2024; originally announced January 2024.
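A toy sketch of the batching idea follows; the token budget and the equal-per-sequence weighting rule are assumptions chosen to illustrate sorted batching and loss balancing, not the LongAlign training code.

# Sorted batching under a token budget, plus per-sequence loss weights so short
# and long sequences contribute comparably; all constants are illustrative.
def sorted_batches(seq_lengths, max_tokens_per_batch=8192):
    # Sort sequence indices by length so each batch holds similarly sized sequences.
    order = sorted(range(len(seq_lengths)), key=lambda i: seq_lengths[i])
    batches, current, used = [], [], 0
    for i in order:
        if current and used + seq_lengths[i] > max_tokens_per_batch:
            batches.append(current)
            current, used = [], 0
        current.append(i)
        used += seq_lengths[i]
    if current:
        batches.append(current)
    return batches

def per_sequence_weights(batch):
    # Give every sequence packed into the batch equal weight in the final loss,
    # instead of letting long sequences dominate by token count.
    return {i: 1.0 / len(batch) for i in batch}

lengths = [900, 15000, 1200, 64000, 700, 32000]
for b in sorted_batches(lengths):
    print(b, per_sequence_weights(b))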
arXiv:2401.12436 [pdf, other] cs.LG cs.CR
Wasserstein Differential Privacy
Authors: Chengyi Yang, Jiayin Qi, Aimin Zhou
Abstract: Differential privacy (DP) has achieved remarkable results in the field of privacy-preserving machine learning. However, existing DP frameworks do not satisfy all the conditions for becoming metrics, which prevents them from deriving better basic privacy properties and leads to exaggerated values of privacy budgets. We propose Wasserstein differential privacy (WDP), an alternative DP framework to measure the risk of privacy leakage, which satisfies the properties of symmetry and triangle inequality. We show and prove that WDP has 13 excellent properties, which provide theoretical support for the better performance of WDP compared with other DP frameworks. In addition, we derive a general privacy accounting method called the Wasserstein accountant, which enables WDP to be applied in stochastic gradient descent (SGD) scenarios that involve sub-sampling. Experiments on basic mechanisms, compositions and deep learning show that the privacy budgets obtained by the Wasserstein accountant are relatively stable and less influenced by order. Moreover, the overestimation of privacy budgets can be effectively alleviated. The code is available at https://github.com/Hifipsysta/WDP.

Submitted 22 January, 2024; originally announced January 2024.
Comments: Accepted by AAAI 2024
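The metric underlying WDP can be pictured with the 1-Wasserstein distance between a mechanism's output distributions on two adjacent datasets. The sketch below is only that generic illustration (for equal-size 1-D samples, W1 reduces to the mean absolute difference of sorted samples); the datasets, noise scale, and setup are invented, and this is not the WDP accountant itself.

# Generic 1-D Wasserstein-1 distance between empirical output samples of a
# Laplace mechanism on adjacent datasets; a metric illustration, not WDP.
import numpy as np

def wasserstein_1d(samples_p, samples_q):
    # For equal-size 1-D samples, W1 equals the mean absolute difference between
    # the sorted samples (optimal transport pairs order statistics).
    p = np.sort(np.asarray(samples_p, dtype=float))
    q = np.sort(np.asarray(samples_q, dtype=float))
    return float(np.mean(np.abs(p - q)))

rng = np.random.default_rng(0)
true_sum_d, true_sum_d_prime = 100.0, 101.0    # adjacent datasets differ in one record
noise = rng.laplace(scale=2.0, size=100_000)   # Laplace mechanism noise
d1 = wasserstein_1d(true_sum_d + noise, true_sum_d_prime + noise)
d2 = wasserstein_1d(true_sum_d_prime + noise, true_sum_d + noise)
print(round(d1, 3), round(d2, 3))  # symmetry: both values are close to 1.0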
arXiv:2401.11818 [pdf, other] cs.MM
MInD: Improving Multimodal Sentiment Analysis via Multimodal Information Disentanglement
Authors: Weichen Dai, Xingyu Li, Zeyu Wang, Pengbo Hu, Ji Qi, Jianlin Peng, Yi Zhou
Abstract: Learning effective joint representations has been a central task in multi-modal sentiment analysis. Previous works addressing this task focus on exploring sophisticated fusion techniques to enhance performance. However, the inherent heterogeneity of distinct modalities remains a core problem that brings challenges in fusing and coordinating multi-modal signals at both the representational level and the informational level, impeding the full exploitation of multi-modal information. To address this problem, we propose the Multi-modal Information Disentanglement (MInD) method, which decomposes the multi-modal inputs into modality-invariant and modality-specific components through a shared encoder and multiple private encoders. Furthermore, by explicitly training generated noise in an adversarial manner, MInD is able to isolate uninformative signals and thus improve the learned representations. Therefore, the proposed disentangled decomposition allows for a fusion process that is simpler than alternative methods and results in improved performance.
Experimental evaluations conducted on representative benchmark datasets demonstrate MInD's effectiveness in both multi-modal emotion recognition and multi-modal humor detection tasks. Code will be released upon acceptance of the paper.

Submitted 17 August, 2024; v1 submitted 22 January, 2024; originally announced January 2024.
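The shared/private decomposition can be pictured with a small linear sketch: one shared projection per modality produces a modality-invariant part and a private projection produces a modality-specific part, which are concatenated for fusion. The dimensions and weights below are made up, and the adversarial noise training is omitted; this is not the MInD architecture.

# Toy linear version of shared (modality-invariant) vs. private (modality-specific)
# encodings followed by concatenation; shapes and weights are illustrative only.
import numpy as np

rng = np.random.default_rng(1)
dims = {"text": 300, "audio": 74, "video": 35}   # assumed input feature sizes
d_shared, d_private = 64, 32

w_shared = {m: rng.normal(size=(d, d_shared)) * 0.01 for m, d in dims.items()}
w_private = {m: rng.normal(size=(d, d_private)) * 0.01 for m, d in dims.items()}

def disentangle(features):
    # features: dict mapping modality name -> (d_m,) feature vector
    invariant = {m: features[m] @ w_shared[m] for m in features}    # shared space
    specific = {m: features[m] @ w_private[m] for m in features}    # private spaces
    fused = np.concatenate([invariant[m] for m in features] + [specific[m] for m in features])
    return invariant, specific, fused

x = {m: rng.normal(size=d) for m, d in dims.items()}
_, _, fused = disentangle(x)
print(fused.shape)  # (3*64 + 3*32,) = (288,)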
arXiv:2401.11432 [pdf, other] cs.RO
Bimanual Deformable Bag Manipulation Using a Structure-of-Interest Based Neural Dynamics Model
Authors: Peng Zhou, Pai Zheng, Jiaming Qi, Chenxi Li, Samantha Lee, Chenguang Yang, David Navarro-Alarcon, Jia Pan
Abstract: The manipulation of deformable objects by robotic systems presents a significant challenge due to their complex and infinite-dimensional configuration spaces. This paper introduces a novel approach to Deformable Object Manipulation (DOM) by emphasizing the identification and manipulation of Structures of Interest (SOIs) in deformable fabric bags. We propose a bimanual manipulation framework that leverages a Graph Neural Network (GNN)-based latent dynamics model to succinctly represent and predict the behavior of these SOIs. Our approach constructs a graph representation from partial point cloud data of the object and learns a latent dynamics model that effectively captures the essential deformations of the fabric bag within a reduced computational space. By integrating this latent dynamics model with Model Predictive Control (MPC), we empower robotic manipulators to perform precise and stable manipulation tasks focused on the SOIs. We have validated our framework through various empirical experiments demonstrating its efficacy in the bimanual manipulation of fabric bags. Our contributions not only address the complexities inherent in DOM but also provide new perspectives and methodologies for enhancing robotic interactions with deformable objects by concentrating on their critical structural elements. Experimental videos can be obtained from https://sites.google.com/view/bagbot.

Submitted 21 October, 2024; v1 submitted 21 January, 2024; originally announced January 2024.
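Once a latent dynamics model is available, the MPC loop itself is standard. The following is a generic random-shooting sketch; the dynamics function, cost, goal state, and dimensions are placeholders rather than the paper's learned GNN model or controller.

# Generic random-shooting MPC over a placeholder latent dynamics model.
import numpy as np

rng = np.random.default_rng(0)
z_goal = np.array([1.0, -0.5, 0.25, 0.0])        # desired SOI latent state (assumed)

def latent_dynamics(z, u):
    # Stand-in for the learned model: next latent state given action u.
    return 0.9 * z + 0.1 * np.tanh(u)

def plan_action(z0, horizon=10, n_candidates=256):
    best_cost, best_first_action = np.inf, None
    for _ in range(n_candidates):
        actions = rng.uniform(-1.0, 1.0, size=(horizon, z0.shape[0]))
        z, cost = z0.copy(), 0.0
        for u in actions:
            z = latent_dynamics(z, u)
            cost += np.sum((z - z_goal) ** 2)    # track the goal latent state
        if cost < best_cost:
            best_cost, best_first_action = cost, actions[0]
    return best_first_action                     # receding horizon: apply the first action only

z = np.zeros(4)
for step in range(5):
    u = plan_action(z)
    z = latent_dynamics(z, u)
print(np.round(z, 3))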
arXiv:2401.10518 [pdf, other] cs.LG
Spatial-temporal Forecasting for Regions without Observations
Authors: Xinyu Su, Jianzhong Qi, Egemen Tanin, Yanchuan Chang, Majid Sarvi
Abstract: Spatial-temporal forecasting plays an important role in many real-world applications, such as traffic forecasting, air pollutant forecasting, crowd-flow forecasting, and so on. State-of-the-art spatial-temporal forecasting models take data-driven approaches and rely heavily on data availability. Such models suffer from accuracy issues when data is incomplete, which is common in reality due to the heavy costs of deploying and maintaining sensors for data collection. A few recent studies attempted to address the issue of incomplete data. They typically assume some data availability in a region of interest, either for a short period or at a few locations. In this paper, we further study spatial-temporal forecasting for a region of interest without any historical observations, to address scenarios such as unbalanced region development, progressive deployment of sensors, or a lack of open data. We propose a model named STSM for the task. The model takes a contrastive learning-based approach to learn spatial-temporal patterns from adjacent regions that have recorded data. Our key insight is to learn from the locations that resemble those in the region of interest, and we propose a selective masking strategy to enable this learning. As a result, our model outperforms adapted state-of-the-art models, reducing errors consistently over both traffic and air pollutant forecasting tasks. The source code is available at https://github.com/suzy0223/STSM.

Submitted 19 January, 2024; originally announced January 2024.
Comments: Accepted by EDBT2024
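One plausible reading of the selective masking idea, hiding observed locations whose static features most resemble the unobserved target region so training mimics the test scenario, is sketched below; the similarity measure, feature choice, and masking rule are assumptions, not the STSM implementation.

# Illustrative selective masking by feature similarity to the unobserved region.
import numpy as np

rng = np.random.default_rng(0)
n_locations, n_features = 20, 8
loc_features = rng.normal(size=(n_locations, n_features))   # e.g., road/POI descriptors
target_region_feature = rng.normal(size=n_features)         # descriptor of the unobserved region

def select_mask(features, target, k=4):
    # Cosine similarity of each observed location to the target-region descriptor.
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    t = target / np.linalg.norm(target)
    similarity = f @ t
    return np.argsort(similarity)[-k:]           # the k most similar locations

masked = select_mask(loc_features, target_region_feature)
print(sorted(masked.tolist()))                    # these locations are hidden during training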
arXiv:2401.02992 [pdf] cs.CL cs.AI
Advanced Unstructured Data Processing for ESG Reports: A Methodology for Structured Transformation and Enhanced Analysis
Authors: Jiahui Peng, Jing Gao, Xin Tong, Jing Guo, Hang Yang, Jianchuan Qi, Ruiqiao Li, Nan Li, Ming Xu
Abstract: In the evolving field of corporate sustainability, analyzing unstructured Environmental, Social, and Governance (ESG) reports is a complex challenge due to their varied formats and intricate content. This study introduces an innovative methodology utilizing the "Unstructured Core Library", specifically tailored to address these challenges by transforming ESG reports into structured, analyzable formats. Our approach significantly advances the existing research by offering high-precision text cleaning, adept identification and extraction of text from images, and standardization of tables within these reports. Emphasizing its capability to handle diverse data types, including text, images, and tables, the method adeptly manages the nuances of differing page layouts and report styles across industries.
This research marks a substantial contribution to the fields of industrial ecology and corporate sustainability assessment, paving the way for the application of advanced NLP technologies and large language models in the analysis of corporate governance and sustainability. Our code is available at https://github.com/linancn/TianGong-AI-Unstructure.git.

Submitted 4 January, 2024; originally announced January 2024.
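As a minimal sketch of the kind of transformation described, assuming the open-source unstructured Python package whose partition entry point dispatches on file type (the file name below is a placeholder), a report can be decomposed into typed elements and re-serialized as plain text:

# Hedged sketch using the open-source `unstructured` package; file name is invented
# and element handling is deliberately generic.
from unstructured.partition.auto import partition

elements = partition(filename="esg_report_2023.pdf")   # detect file type, split into elements
for el in elements[:10]:
    # Element classes indicate structure (e.g., Title, NarrativeText, Table); str() gives text.
    print(type(el).__name__, "->", str(el)[:80])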
arXiv:2401.01577 [pdf, other] cs.CV
Test-Time Personalization with Meta Prompt for Gaze Estimation
Authors: Huan Liu, Julia Qi, Zhenhao Li, Mohammad Hassanpour, Yang Wang, Konstantinos Plataniotis, Yuanhao Yu
Abstract: Despite the recent remarkable achievements in gaze estimation, efficient and accurate personalization of gaze estimation without labels is a practical problem that is rarely touched on in the literature. To achieve efficient personalization, we take inspiration from recent advances in Natural Language Processing (NLP) by updating a negligible number of parameters, "prompts", at test time. Specifically, the prompt is attached without perturbing the original network and can contain less than 1% of a ResNet-18's parameters. Our experiments show the high efficiency of the prompt tuning approach: the proposed method adapts up to 10 times faster than the compared methods. However, it is non-trivial to update the prompt for personalized gaze estimation without labels. At test time, it is essential to ensure that minimizing a particular unsupervised loss also minimizes the gaze estimation error. To address this difficulty, we propose to meta-learn the prompt so that its updates align with this goal. Our experiments show that the meta-learned prompt can be effectively adapted even with a simple symmetry loss. In addition, we experiment on four cross-dataset validations to show the remarkable advantages of the proposed method. Code is available at https://github.com/hmarkamcan/TPGaze.

Submitted 12 March, 2024; v1 submitted 3 January, 2024; originally announced January 2024.
Comments: Accepted by AAAI 2024
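A toy PyTorch sketch of test-time prompt tuning with a symmetry-style loss follows: only a small prompt tensor is updated, and the unsupervised loss asks the gaze predicted for an image and for its horizontal flip to mirror each other. The backbone, prompt placement, and loss are placeholders chosen for illustration, not the TPGaze code.

# Only `prompt` is optimized at test time; the rest of the network stays frozen.
import torch
import torch.nn as nn

class ToyGazeNet(nn.Module):
    def __init__(self, prompt_dim=8):
        super().__init__()
        self.backbone = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 64), nn.ReLU())
        self.head = nn.Linear(64 + prompt_dim, 2)             # outputs (pitch, yaw)
        self.prompt = nn.Parameter(torch.zeros(prompt_dim))   # the only adapted parameters

    def forward(self, x):
        feat = self.backbone(x)
        prompt = self.prompt.expand(feat.shape[0], -1)
        return self.head(torch.cat([feat, prompt], dim=1))

model = ToyGazeNet()
for p in model.backbone.parameters():
    p.requires_grad_(False)
for p in model.head.parameters():
    p.requires_grad_(False)

opt = torch.optim.SGD([model.prompt], lr=0.1)
x = torch.randn(4, 3, 32, 32)                     # unlabeled test images of one person
for _ in range(10):
    g = model(x)
    g_flip = model(torch.flip(x, dims=[3]))       # horizontally flipped images
    # Symmetry assumption: pitch unchanged, yaw negated under a horizontal flip.
    loss = ((g[:, 0] - g_flip[:, 0]) ** 2 + (g[:, 1] + g_flip[:, 1]) ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
print(loss.item())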
arXiv:2312.17259 [pdf] cs.CL cs.AI
Empowering Working Memory for Large Language Model Agents
Authors: Jing Guo, Nan Li, Jianchuan Qi, Hang Yang, Ruiqiao Li, Yuzhen Feng, Si Zhang, Ming Xu
Abstract: Large language models (LLMs) have achieved impressive linguistic capabilities. However, a key limitation persists in their lack of human-like memory faculties. LLMs exhibit constrained memory retention across sequential interactions, hindering complex reasoning. This paper explores the potential of applying cognitive psychology's working memory frameworks to enhance LLM architecture. The limitations of traditional LLM memory designs are analyzed, including their isolation of distinct dialog episodes and lack of persistent memory links. To address this, an innovative model is proposed that incorporates a centralized Working Memory Hub and Episodic Buffer access to retain memories across episodes. This architecture aims to provide greater continuity for nuanced contextual reasoning during intricate tasks and collaborative scenarios. While promising, further research is required into optimizing episodic memory encoding, storage, prioritization, retrieval, and security. Overall, this paper provides a strategic blueprint for developing LLM agents with more sophisticated, human-like memory capabilities, highlighting memory mechanisms as a vital frontier in artificial general intelligence.

Submitted 28 May, 2024; v1 submitted 22 December, 2023; originally announced December 2023.
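A bare-bones data-structure reading of the proposed architecture might look like the sketch below; the class, fields, and keyword-based retrieval rule are invented here for illustration and are not the paper's specification.

# Illustrative working-memory hub whose episodic buffer persists across dialog episodes.
from collections import deque

class WorkingMemoryHub:
    def __init__(self, capacity=6):
        self.episodic_buffer = []                    # long-lived records kept across episodes
        self.active_window = deque(maxlen=capacity)  # small, recency-based working set

    def observe(self, episode_id, content):
        record = {"episode": episode_id, "content": content}
        self.active_window.append(record)
        self.episodic_buffer.append(record)

    def recall(self, keyword):
        # Naive cross-episode retrieval by keyword; a real agent would likely use embeddings.
        return [r for r in self.episodic_buffer if keyword.lower() in r["content"].lower()]

hub = WorkingMemoryHub()
hub.observe("episode-1", "User prefers metric units in all reports.")
hub.observe("episode-2", "Task: summarize the Q3 emissions report.")
print(hub.recall("metric"))   # memory from episode-1 is still reachable in episode-2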
arXiv:2312.16355 [pdf, other] cs.DB
Efficient Cost Modeling of Space-filling Curves
Authors: Guanli Liu, Lars Kulik, Christian S. Jensen, Tianyi Li, Jianzhong Qi
Abstract: A space-filling curve (SFC) maps points in a multi-dimensional space to one-dimensional points by discretizing the multi-dimensional space into cells and imposing a linear order on the cells. This way, an SFC enables the indexing of multi-dimensional data using a one-dimensional index such as a B+-tree. Choosing an appropriate SFC is crucial, as different SFCs have different effects on query performance. Currently, there are two primary strategies: 1) deterministic schemes, which are computationally efficient but often yield suboptimal query performance, and 2) dynamic schemes, which consider a broad range of candidate SFCs based on cost functions but incur significant computational overhead. Despite these strategies, existing methods cannot efficiently measure the effectiveness of SFCs under heavy query workloads and numerous SFC options. To address this problem, we propose means of constant-time cost estimation that can enhance existing SFC selection algorithms, enabling them to learn more effective SFCs. Additionally, we propose an SFC learning method that leverages reinforcement learning and our cost estimation to choose an SFC pattern efficiently. Experimental studies offer evidence of the effectiveness and efficiency of the proposed means of cost estimation and SFC learning.

Submitted 26 December, 2023; originally announced December 2023.
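A concrete instance of such a linear order is the Z-order (Morton) curve, which interleaves the bits of the cell coordinates. The sketch below shows 2-D Morton encoding only as background for what an SFC does; it is not the paper's cost model.

# 2-D Z-order (Morton) encoding: linearize grid cells by interleaving coordinate bits.
def morton_2d(x, y, bits=16):
    code = 0
    for i in range(bits):
        code |= ((x >> i) & 1) << (2 * i)        # x bits go to even positions
        code |= ((y >> i) & 1) << (2 * i + 1)    # y bits go to odd positions
    return code

cells = [(0, 0), (1, 0), (0, 1), (1, 1), (2, 0), (2, 2)]
for cx, cy in cells:
    print((cx, cy), "->", morton_2d(cx, cy))
# Sorting cells by this code yields the one-dimensional order a B+-tree would index.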
arXiv:2312.05402 [pdf, other] cs.CL
Towards Controlled Table-to-Text Generation with Scientific Reasoning
Authors: Zhixin Guo, Jianping Zhou, Jiexing Qi, Mingxuan Yan, Ziwei He, Guanjie Zheng, Zhouhan Lin, Xinbing Wang, Chenghu Zhou
Abstract: The sheer volume of scientific experimental results and complex technical statements, often presented in tabular formats, presents a formidable barrier to individuals seeking to acquire preferred information. Scientific reasoning and content generation that adhere to user preferences encounter distinct challenges. In this work, we present a new task for generating fluent and logical descriptions that match user preferences over scientific tabular data, aiming to automate scientific document analysis. To facilitate research in this direction, we construct a new challenging dataset, CTRLSciTab, consisting of table-description pairs extracted from the scientific literature, with highlighted cells and a corresponding domain-specific knowledge base. We evaluate popular pre-trained language models to establish a baseline and propose a novel architecture that outperforms competing approaches. The results show that large models struggle to produce accurate content that aligns with user preferences. As the first of its kind, our work should motivate further research in scientific domains.
Submitted 8 December, 2023; originally announced December 2023.
