Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 175 results for author: <span class="mathjax">Zhong, S</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhong%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhong, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhong%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhong, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09251">arXiv:2411.09251</a> <span> [<a href="https://arxiv.org/pdf/2411.09251">pdf</a>, <a href="https://arxiv.org/format/2411.09251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross Space and Time: A Spatio-Temporal Unitized Model for Traffic Flow Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ruan%2C+W">Weilin Ruan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenzhuo Wang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Siru Zhong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09251v1-abstract-short" style="display: inline;"> Predicting spatio-temporal traffic flow presents significant challenges due to complex interactions between spatial and temporal factors. Existing approaches often address these dimensions in isolation, neglecting their critical interdependencies. In this paper, we introduce the Spatio-Temporal Unitized Model (STUM), a unified framework designed to capture both spatial and temporal dependencies wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09251v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09251v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09251v1-abstract-full" style="display: none;"> Predicting spatio-temporal traffic flow presents significant challenges due to complex interactions between spatial and temporal factors. Existing approaches often address these dimensions in isolation, neglecting their critical interdependencies. In this paper, we introduce the Spatio-Temporal Unitized Model (STUM), a unified framework designed to capture both spatial and temporal dependencies while addressing spatio-temporal heterogeneity through techniques such as distribution alignment and feature fusion. It also ensures both predictive accuracy and computational efficiency. Central to STUM is the Adaptive Spatio-temporal Unitized Cell (ASTUC), which utilizes low-rank matrices to seamlessly store, update, and interact with space, time, as well as their correlations. Our framework is also modular, allowing it to integrate with various spatio-temporal graph neural networks through components such as backbone models, feature extractors, residual fusion blocks, and predictive modules to collectively enhance forecasting outcomes. Experimental results across multiple real-world datasets demonstrate that STUM consistently improves prediction performance with minimal computational cost. These findings are further supported by hyperparameter optimization, pre-training analysis, and result visualization. We provide our source code for reproducibility at https://anonymous.4open.science/r/STUM-E4F0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09251v1-abstract-full').style.display = 'none'; document.getElementById('2411.09251v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06950">arXiv:2411.06950</a> <span> [<a href="https://arxiv.org/pdf/2411.06950">pdf</a>, <a href="https://arxiv.org/format/2411.06950">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Sniff AI: Is My 'Spicy' Your 'Spicy'? Exploring LLM's Perceptual Alignment with Human Smell Experiences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shu Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zetao Zhou</a>, <a href="/search/cs?searchtype=author&query=Dawes%2C+C">Christopher Dawes</a>, <a href="/search/cs?searchtype=author&query=Brianz%2C+G">Giada Brianz</a>, <a href="/search/cs?searchtype=author&query=Obrist%2C+M">Marianna Obrist</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06950v1-abstract-short" style="display: inline;"> Aligning AI with human intent is important, yet perceptual alignment-how AI interprets what we see, hear, or smell-remains underexplored. This work focuses on olfaction, human smell experiences. We conducted a user study with 40 participants to investigate how well AI can interpret human descriptions of scents. Participants performed "sniff and describe" interactive tasks, with our designed AI sys… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06950v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06950v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06950v1-abstract-full" style="display: none;"> Aligning AI with human intent is important, yet perceptual alignment-how AI interprets what we see, hear, or smell-remains underexplored. This work focuses on olfaction, human smell experiences. We conducted a user study with 40 participants to investigate how well AI can interpret human descriptions of scents. Participants performed "sniff and describe" interactive tasks, with our designed AI system attempting to guess what scent the participants were experiencing based on their descriptions. These tasks evaluated the Large Language Model's (LLMs) contextual understanding and representation of scent relationships within its internal states - high-dimensional embedding space. Both quantitative and qualitative methods were used to evaluate the AI system's performance. Results indicated limited perceptual alignment, with biases towards certain scents, like lemon and peppermint, and continued failing to identify others, like rosemary. We discuss these findings in light of human-AI alignment advancements, highlighting the limitations and opportunities for enhancing HCI systems with multisensory experience integration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06950v1-abstract-full').style.display = 'none'; document.getElementById('2411.06950v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06175">arXiv:2411.06175</a> <span> [<a href="https://arxiv.org/pdf/2411.06175">pdf</a>, <a href="https://arxiv.org/format/2411.06175">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Clustering Algorithms and RAG Enhancing Semi-Supervised Text Classification with Large LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shan Zhong</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jiahao Zeng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yongxin Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bohong Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06175v1-abstract-short" style="display: inline;"> This paper introduces an innovative semi-supervised learning approach for text classification, addressing the challenge of abundant data but limited labeled examples. Our methodology integrates few-shot learning with retrieval-augmented generation (RAG) and conventional statistical clustering, enabling effective learning from a minimal number of labeled instances while generating high-quality labe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06175v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06175v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06175v1-abstract-full" style="display: none;"> This paper introduces an innovative semi-supervised learning approach for text classification, addressing the challenge of abundant data but limited labeled examples. Our methodology integrates few-shot learning with retrieval-augmented generation (RAG) and conventional statistical clustering, enabling effective learning from a minimal number of labeled instances while generating high-quality labeled data. To the best of our knowledge, we are the first to incorporate RAG alongside clustering in text data generation. Our experiments on the Reuters and Web of Science datasets demonstrate state-of-the-art performance, with few-shot augmented data alone producing results nearly equivalent to those achieved with fully labeled datasets. Notably, accuracies of 95.41\% and 82.43\% were achieved for complex text document classification tasks, where the number of categories can exceed 100. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06175v1-abstract-full').style.display = 'none'; document.getElementById('2411.06175v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06159">arXiv:2411.06159</a> <span> [<a href="https://arxiv.org/pdf/2411.06159">pdf</a>, <a href="https://arxiv.org/format/2411.06159">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> From References to Insights: Collaborative Knowledge Minigraph Agents for Automating Scholarly Literature Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng-hua Zhong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Gong Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yu Yang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiannong Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06159v1-abstract-short" style="display: inline;"> Literature reviews play a crucial role in scientific research for understanding the current state of research, identifying gaps, and guiding future studies on specific topics. However, the process of conducting a comprehensive literature review is yet time-consuming. This paper proposes a novel framework, collaborative knowledge minigraph agents (CKMAs), to automate scholarly literature reviews. A… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06159v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06159v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06159v1-abstract-full" style="display: none;"> Literature reviews play a crucial role in scientific research for understanding the current state of research, identifying gaps, and guiding future studies on specific topics. However, the process of conducting a comprehensive literature review is yet time-consuming. This paper proposes a novel framework, collaborative knowledge minigraph agents (CKMAs), to automate scholarly literature reviews. A novel prompt-based algorithm, the knowledge minigraph construction agent (KMCA), is designed to identify relationships between information pieces from academic literature and automatically constructs knowledge minigraphs. By leveraging the capabilities of large language models on constructed knowledge minigraphs, the multiple path summarization agent (MPSA) efficiently organizes information pieces and relationships from different viewpoints to generate literature review paragraphs. We evaluate CKMAs on three benchmark datasets. Experimental results demonstrate that the proposed techniques generate informative, complete, consistent, and insightful summaries for different research problems, promoting the use of LLMs in more professional fields. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06159v1-abstract-full').style.display = 'none'; document.getElementById('2411.06159v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00394">arXiv:2411.00394</a> <span> [<a href="https://arxiv.org/pdf/2411.00394">pdf</a>, <a href="https://arxiv.org/format/2411.00394">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Right this way: Can VLMs Guide Us to See More to Answer Questions? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Diji Yang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sijia Zhong</a>, <a href="/search/cs?searchtype=author&query=Tholeti%2C+K+S+S">Kalyana Suma Sree Tholeti</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+L">Lei Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Gilpin%2C+L+H">Leilani H. Gilpin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00394v1-abstract-short" style="display: inline;"> In question-answering scenarios, humans can assess whether the available information is sufficient and seek additional information if necessary, rather than providing a forced answer. In contrast, Vision Language Models (VLMs) typically generate direct, one-shot responses without evaluating the sufficiency of the information. To investigate this gap, we identify a critical and challenging task in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00394v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00394v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00394v1-abstract-full" style="display: none;"> In question-answering scenarios, humans can assess whether the available information is sufficient and seek additional information if necessary, rather than providing a forced answer. In contrast, Vision Language Models (VLMs) typically generate direct, one-shot responses without evaluating the sufficiency of the information. To investigate this gap, we identify a critical and challenging task in the Visual Question Answering (VQA) scenario: can VLMs indicate how to adjust an image when the visual information is insufficient to answer a question? This capability is especially valuable for assisting visually impaired individuals who often need guidance to capture images correctly. To evaluate this capability of current VLMs, we introduce a human-labeled dataset as a benchmark for this task. Additionally, we present an automated framework that generates synthetic training data by simulating ``where to know'' scenarios. Our empirical results show significant performance improvements in mainstream VLMs when fine-tuned with this synthetic data. This study demonstrates the potential to narrow the gap between information assessment and acquisition in VLMs, bringing their performance closer to humans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00394v1-abstract-full').style.display = 'none'; document.getElementById('2411.00394v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23822">arXiv:2410.23822</a> <span> [<a href="https://arxiv.org/pdf/2410.23822">pdf</a>, <a href="https://arxiv.org/ps/2410.23822">ps</a>, <a href="https://arxiv.org/format/2410.23822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Parameter-Efficient Fine-Tuning Medical Multimodal Large Language Models for Medical Visual Grounding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Jinlong He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pengfei Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Gang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shenjun Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23822v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) inherit the superior text understanding capabilities of LLMs and extend these capabilities to multimodal scenarios. These models achieve excellent results in the general domain of multimodal tasks. However, in the medical domain, the substantial training costs and the requirement for extensive medical data pose challenges to the development of medical MLLMs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23822v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23822v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) inherit the superior text understanding capabilities of LLMs and extend these capabilities to multimodal scenarios. These models achieve excellent results in the general domain of multimodal tasks. However, in the medical domain, the substantial training costs and the requirement for extensive medical data pose challenges to the development of medical MLLMs. Furthermore, due to the free-text form of answers, tasks such as visual grounding that need to produce output in a prescribed form become difficult for MLLMs. So far, there have been no medical MLLMs works in medical visual grounding area. For the medical vision grounding task, which involves identifying locations in medical images based on short text descriptions, we propose Parameter-efficient Fine-tuning medical multimodal large language models for Medcial Visual Grounding (PFMVG). To validate the performance of the model, we evaluate it on a public benchmark dataset for medical visual grounding, where it achieves competitive results, and significantly outperforming GPT-4v. Our code will be open sourced after peer review. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23822v1-abstract-full').style.display = 'none'; document.getElementById('2410.23822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23109">arXiv:2410.23109</a> <span> [<a href="https://arxiv.org/pdf/2410.23109">pdf</a>, <a href="https://arxiv.org/format/2410.23109">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Geometry">cs.CG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> NASM: Neural Anisotropic Surface Meshing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongbo Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Haikuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sikai Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Ningna Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Cheng Lin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xiaohu Guo</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+S">Shiqing Xin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+J">Jing Hua</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zichun Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23109v2-abstract-short" style="display: inline;"> This paper introduces a new learning-based method, NASM, for anisotropic surface meshing. Our key idea is to propose a graph neural network to embed an input mesh into a high-dimensional (high-d) Euclidean embedding space to preserve curvature-based anisotropic metric by using a dot product loss between high-d edge vectors. This can dramatically reduce the computational time and increase the scala… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23109v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23109v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23109v2-abstract-full" style="display: none;"> This paper introduces a new learning-based method, NASM, for anisotropic surface meshing. Our key idea is to propose a graph neural network to embed an input mesh into a high-dimensional (high-d) Euclidean embedding space to preserve curvature-based anisotropic metric by using a dot product loss between high-d edge vectors. This can dramatically reduce the computational time and increase the scalability. Then, we propose a novel feature-sensitive remeshing on the generated high-d embedding to automatically capture sharp geometric features. We define a high-d normal metric, and then derive an automatic differentiation on a high-d centroidal Voronoi tessellation (CVT) optimization with the normal metric to simultaneously preserve geometric features and curvature anisotropy that exhibit in the original 3D shapes. To our knowledge, this is the first time that a deep learning framework and a large dataset are proposed to construct a high-d Euclidean embedding space for 3D anisotropic surface meshing. Experimental results are evaluated and compared with the state-of-the-art in anisotropic surface meshing on a large number of surface models from Thingi10K dataset as well as tested on extensive unseen 3D shapes from Multi-Garment Network dataset and FAUST human dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23109v2-abstract-full').style.display = 'none'; document.getElementById('2410.23109v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SIGGRAPH Asia 2024 (Conference Track)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21731">arXiv:2410.21731</a> <span> [<a href="https://arxiv.org/pdf/2410.21731">pdf</a>, <a href="https://arxiv.org/format/2410.21731">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3698829">10.1145/3698829 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Understanding and Reusing Test Suites Across Database Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Suyang Zhong</a>, <a href="/search/cs?searchtype=author&query=Rigger%2C+M">Manuel Rigger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21731v1-abstract-short" style="display: inline;"> Database Management System (DBMS) developers have implemented extensive test suites to test their DBMSs. For example, the SQLite test suites contain over 92 million lines of code. Despite these extensive efforts, test suites are not systematically reused across DBMSs, leading to wasted effort. Integration is challenging, as test suites use various test case formats and rely on unstandardized test… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21731v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21731v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21731v1-abstract-full" style="display: none;"> Database Management System (DBMS) developers have implemented extensive test suites to test their DBMSs. For example, the SQLite test suites contain over 92 million lines of code. Despite these extensive efforts, test suites are not systematically reused across DBMSs, leading to wasted effort. Integration is challenging, as test suites use various test case formats and rely on unstandardized test runner features. We present a unified test suite, SQuaLity, in which we integrated test cases from three widely-used DBMSs, SQLite, PostgreSQL, and DuckDB. In addition, we present an empirical study to determine the potential of reusing these systems' test suites. Our results indicate that reusing test suites is challenging: First, test formats and test runner commands vary widely; for example, SQLite has 4 test runner commands, while MySQL has 112 commands with additional features, to, for example, execute file operations or interact with a shell. Second, while some test suites contain mostly standard-compliant statements (e.g., 99% in SQLite), other test suites mostly test non-standardized functionality (e.g., 31% of statements in the PostgreSQL test suite are nonstandardized). Third, test reuse is complicated by various explicit and implicit dependencies, such as the need to set variables and configurations, certain test cases requiring extensions not present by default, and query results depending on specific clients. Despite the above findings, we have identified 3 crashes, 3 hangs, and multiple compatibility issues across four different DBMSs by executing test suites across DBMSs, indicating the benefits of reuse. Overall, this work represents the first step towards test-case reuse in the context of DBMSs, and we hope that it will inspire follow-up work on this important topic. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21731v1-abstract-full').style.display = 'none'; document.getElementById('2410.21731v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.5; H.2.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21160">arXiv:2410.21160</a> <span> [<a href="https://arxiv.org/pdf/2410.21160">pdf</a>, <a href="https://arxiv.org/format/2410.21160">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> KaLDeX: Kalman Filter based Linear Deformable Cross Attention for Retina Vessel Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhihao Zhao</a>, <a href="/search/cs?searchtype=author&query=Faghihroohi%2C+S">Shahrooz Faghihroohi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yinzheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junjie Yang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shipeng Zhong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kai Huang</a>, <a href="/search/cs?searchtype=author&query=Navab%2C+N">Nassir Navab</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyang Li</a>, <a href="/search/cs?searchtype=author&query=Nasseri%2C+M+A">M. Ali Nasseri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21160v1-abstract-short" style="display: inline;"> Background and Objective: In the realm of ophthalmic imaging, accurate vascular segmentation is paramount for diagnosing and managing various eye diseases. Contemporary deep learning-based vascular segmentation models rival human accuracy but still face substantial challenges in accurately segmenting minuscule blood vessels in neural network applications. Due to the necessity of multiple downsampl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21160v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21160v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21160v1-abstract-full" style="display: none;"> Background and Objective: In the realm of ophthalmic imaging, accurate vascular segmentation is paramount for diagnosing and managing various eye diseases. Contemporary deep learning-based vascular segmentation models rival human accuracy but still face substantial challenges in accurately segmenting minuscule blood vessels in neural network applications. Due to the necessity of multiple downsampling operations in the CNN models, fine details from high-resolution images are inevitably lost. The objective of this study is to design a structure to capture the delicate and small blood vessels. Methods: To address these issues, we propose a novel network (KaLDeX) for vascular segmentation leveraging a Kalman filter based linear deformable cross attention (LDCA) module, integrated within a UNet++ framework. Our approach is based on two key components: Kalman filter (KF) based linear deformable convolution (LD) and cross-attention (CA) modules. The LD module is designed to adaptively adjust the focus on thin vessels that might be overlooked in standard convolution. The CA module improves the global understanding of vascular structures by aggregating the detailed features from the LD module with the high level features from the UNet++ architecture. Finally, we adopt a topological loss function based on persistent homology to constrain the topological continuity of the segmentation. Results: The proposed method is evaluated on retinal fundus image datasets (DRIVE, CHASE_BD1, and STARE) as well as the 3mm and 6mm of the OCTA-500 dataset, achieving an average accuracy (ACC) of 97.25%, 97.77%, 97.85%, 98.89%, and 98.21%, respectively. Conclusions: Empirical evidence shows that our method outperforms the current best models on different vessel segmentation datasets. Our source code is available at: https://github.com/AIEyeSystem/KalDeX. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21160v1-abstract-full').style.display = 'none'; document.getElementById('2410.21160v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19548">arXiv:2410.19548</a> <span> [<a href="https://arxiv.org/pdf/2410.19548">pdf</a>, <a href="https://arxiv.org/format/2410.19548">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Privacy-Preserving Federated Learning via Dataset Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+S">ShiMao Xu</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+X">Xiaopeng Ke</a>, <a href="/search/cs?searchtype=author&query=Su%2C+X">Xing Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shucheng Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fengyuan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19548v3-abstract-short" style="display: inline;"> Federated Learning (FL) allows users to share knowledge instead of raw data to train a model with high accuracy. Unfortunately, during the training, users lose control over the knowledge shared, which causes serious data privacy issues. We hold that users are only willing and need to share the essential knowledge to the training task to obtain the FL model with high accuracy. However, existing eff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19548v3-abstract-full').style.display = 'inline'; document.getElementById('2410.19548v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19548v3-abstract-full" style="display: none;"> Federated Learning (FL) allows users to share knowledge instead of raw data to train a model with high accuracy. Unfortunately, during the training, users lose control over the knowledge shared, which causes serious data privacy issues. We hold that users are only willing and need to share the essential knowledge to the training task to obtain the FL model with high accuracy. However, existing efforts cannot help users minimize the shared knowledge according to the user intention in the FL training procedure. This work proposes FLiP, which aims to bring the principle of least privilege (PoLP) to FL training. The key design of FLiP is applying elaborate information reduction on the training data through a local-global dataset distillation design. We measure the privacy performance through attribute inference and membership inference attacks. Extensive experiments show that FLiP strikes a good balance between model accuracy and privacy protection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19548v3-abstract-full').style.display = 'none'; document.getElementById('2410.19548v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15288">arXiv:2410.15288</a> <span> [<a href="https://arxiv.org/pdf/2410.15288">pdf</a>, <a href="https://arxiv.org/format/2410.15288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Attention Is All You Need for LLM-based Code Vulnerability Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yue Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xiuzhen Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fengyuan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15288v1-abstract-short" style="display: inline;"> The rapid expansion of software systems and the growing number of reported vulnerabilities have emphasized the importance of accurately identifying vulnerable code segments. Traditional methods for vulnerability localization, such as manual code audits or rule-based tools, are often time-consuming and limited in scope, typically focusing on specific programming languages or types of vulnerabilitie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15288v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15288v1-abstract-full" style="display: none;"> The rapid expansion of software systems and the growing number of reported vulnerabilities have emphasized the importance of accurately identifying vulnerable code segments. Traditional methods for vulnerability localization, such as manual code audits or rule-based tools, are often time-consuming and limited in scope, typically focusing on specific programming languages or types of vulnerabilities. In recent years, the introduction of large language models (LLMs) such as GPT and LLaMA has opened new possibilities for automating vulnerability detection. However, while LLMs show promise in this area, they face challenges, particularly in maintaining accuracy over longer code contexts. This paper introduces LOVA, a novel framework leveraging the self-attention mechanisms inherent in LLMs to enhance vulnerability localization. Our key insight is that self-attention mechanisms assign varying importance to different parts of the input, making it possible to track how much attention the model focuses on specific lines of code. In the context of vulnerability localization, the hypothesis is that vulnerable lines of code will naturally attract higher attention weights because they have a greater influence on the model's output. By systematically tracking changes in attention weights and focusing on specific lines of code, LOVA improves the precision of identifying vulnerable lines across various programming languages. Through rigorous experimentation and evaluation, we demonstrate that LOVA significantly outperforms existing LLM-based approaches, achieving up to a 5.3x improvement in F1-scores. LOVA also demonstrated strong scalability, with up to a 14.6x improvement in smart contract vulnerability localization across languages like C, Python, Java, and Solidity. Its robustness was proven through consistent performance across different LLM architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15288v1-abstract-full').style.display = 'none'; document.getElementById('2410.15288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14257">arXiv:2410.14257</a> <span> [<a href="https://arxiv.org/pdf/2410.14257">pdf</a>, <a href="https://arxiv.org/format/2410.14257">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Revisiting SLO and Goodput Metrics in LLM Serving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhibin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shipeng Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuhang Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xue Li</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+R">Rong Gu</a>, <a href="/search/cs?searchtype=author&query=Cam-Tu%2C+N">Nguyen Cam-Tu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Chen Tian</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14257v1-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved remarkable performance and are widely deployed in various applications, while the serving of LLM inference has raised concerns about user experience and serving throughput. Accordingly, service level objectives (SLOs) and goodput-the number of requests that meet SLOs per second-are introduced to evaluate the performance of LLM serving. However, existing m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14257v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14257v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14257v1-abstract-full" style="display: none;"> Large language models (LLMs) have achieved remarkable performance and are widely deployed in various applications, while the serving of LLM inference has raised concerns about user experience and serving throughput. Accordingly, service level objectives (SLOs) and goodput-the number of requests that meet SLOs per second-are introduced to evaluate the performance of LLM serving. However, existing metrics fail to capture the nature of user experience. We observe two ridiculous phenomena in existing metrics: 1) delaying token delivery can smooth the tail time between tokens (tail TBT) of a request and 2) dropping the request that fails to meet the SLOs midway can improve goodput. In this paper, we revisit SLO and goodput metrics in LLM serving and propose a unified metric framework smooth goodput including SLOs and goodput to reflect the nature of user experience in LLM serving. The framework can adapt to specific goals of different tasks by setting parameters. We re-evaluate the performance of different LLM serving systems under multiple workloads based on this unified framework and provide possible directions for future optimization of existing strategies. We hope that this framework can provide a unified standard for evaluating LLM serving and foster researches in the field of LLM serving optimization to move in a cohesive direction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14257v1-abstract-full').style.display = 'none'; document.getElementById('2410.14257v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12463">arXiv:2410.12463</a> <span> [<a href="https://arxiv.org/pdf/2410.12463">pdf</a>, <a href="https://arxiv.org/format/2410.12463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> RADS-Checker: Measuring Compliance with Right of Access by the Data Subject in Android Markets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenhua Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhanpeng Liang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+C">Congcong Yao</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+J">Jingyu Hua</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12463v1-abstract-short" style="display: inline;"> The latest data protection regulations worldwide, such as the General Data Protection Regulation (GDPR), have established the Right of Access by the Data Subject (RADS), granting users the right to access and obtain a copy of their personal data from the data controllers. This clause can effectively compel data controllers to handle user personal data more cautiously, which is of significant impor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12463v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12463v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12463v1-abstract-full" style="display: none;"> The latest data protection regulations worldwide, such as the General Data Protection Regulation (GDPR), have established the Right of Access by the Data Subject (RADS), granting users the right to access and obtain a copy of their personal data from the data controllers. This clause can effectively compel data controllers to handle user personal data more cautiously, which is of significant importance for protecting user privacy. However, there is currently no research systematically examining whether RADS has been effectively implemented in mobile apps, which are the most common personal data controllers. In this study, we propose a compliance measurement framework for RADS in apps. In our framework, we first analyze an app's privacy policy text using NLP techniques such as GPT-4 to verify whether it clearly declares offering RADS to users and provides specific details on how the right can be exercised. Next, we assess the authenticity and usability of the identified implementation methods by submitting data access requests to the app. Finally, for the obtained data copies, we further verify their completeness by comparing them with the user personal data actually collected by the app during runtime, as captured by Frida Hook. We analyzed a total of 1,631 apps in the American app market G and the Chinese app market H. The results show that less than 54.50% and 37.05% of apps in G and H, respectively, explicitly state in their privacy policies that they can provide users with copies of their personal data. Additionally, in both app markets, less than 20% of apps could truly provide users with their data copies. Finally, among the obtained data copies, only about 2.94% from G pass the completeness verification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12463v1-abstract-full').style.display = 'none'; document.getElementById('2410.12463v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09531">arXiv:2410.09531</a> <span> [<a href="https://arxiv.org/pdf/2410.09531">pdf</a>, <a href="https://arxiv.org/format/2410.09531">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3676536.3676661">10.1145/3676536.3676661 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> PrivQuant: Communication-Efficient Private Inference with Quantized Network/Protocol Co-Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianshi Xu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shuzhang Zhong</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Wenxuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Runsheng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Meng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09531v1-abstract-short" style="display: inline;"> Private deep neural network (DNN) inference based on secure two-party computation (2PC) enables secure privacy protection for both the server and the client. However, existing secure 2PC frameworks suffer from a high inference latency due to enormous communication. As the communication of both linear and non-linear DNN layers reduces with the bit widths of weight and activation, in this paper, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09531v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09531v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09531v1-abstract-full" style="display: none;"> Private deep neural network (DNN) inference based on secure two-party computation (2PC) enables secure privacy protection for both the server and the client. However, existing secure 2PC frameworks suffer from a high inference latency due to enormous communication. As the communication of both linear and non-linear DNN layers reduces with the bit widths of weight and activation, in this paper, we propose PrivQuant, a framework that jointly optimizes the 2PC-based quantized inference protocols and the network quantization algorithm, enabling communication-efficient private inference. PrivQuant proposes DNN architecture-aware optimizations for the 2PC protocols for communication-intensive quantized operators and conducts graph-level operator fusion for communication reduction. Moreover, PrivQuant also develops a communication-aware mixed precision quantization algorithm to improve inference efficiency while maintaining high accuracy. The network/protocol co-optimization enables PrivQuant to outperform prior-art 2PC frameworks. With extensive experiments, we demonstrate PrivQuant reduces communication by $11\times, 2.5\times \mathrm{and}~ 2.8\times$, which results in $8.7\times, 1.8\times ~ \mathrm{and}~ 2.4\times$ latency reduction compared with SiRNN, COINN, and CoPriv, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09531v1-abstract-full').style.display = 'none'; document.getElementById('2410.09531v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCAD 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09374">arXiv:2410.09374</a> <span> [<a href="https://arxiv.org/pdf/2410.09374">pdf</a>, <a href="https://arxiv.org/format/2410.09374">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+J">Junkai Niu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xiuyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Shaojie Shen</a>, <a href="/search/cs?searchtype=author&query=Gallego%2C+G">Guillermo Gallego</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09374v1-abstract-short" style="display: inline;"> Event-based visual odometry is a specific branch of visual Simultaneous Localization and Mapping (SLAM) techniques, which aims at solving tracking and mapping sub-problems in parallel by exploiting the special working principles of neuromorphic (ie, event-based) cameras. Due to the motion-dependent nature of event data, explicit data association ie, feature matching under large-baseline view-point… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09374v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09374v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09374v1-abstract-full" style="display: none;"> Event-based visual odometry is a specific branch of visual Simultaneous Localization and Mapping (SLAM) techniques, which aims at solving tracking and mapping sub-problems in parallel by exploiting the special working principles of neuromorphic (ie, event-based) cameras. Due to the motion-dependent nature of event data, explicit data association ie, feature matching under large-baseline view-point changes is hardly established, making direct methods a more rational choice. However, state-of-the-art direct methods are limited by the high computational complexity of the mapping sub-problem and the degeneracy of camera pose tracking in certain degrees of freedom (DoF) in rotation. In this paper, we resolve these issues by building an event-based stereo visual-inertial odometry system on top of our previous direct pipeline Event-based Stereo Visual Odometry. Specifically, to speed up the mapping operation, we propose an efficient strategy for sampling contour points according to the local dynamics of events. The mapping performance is also improved in terms of structure completeness and local smoothness by merging the temporal stereo and static stereo results. To circumvent the degeneracy of camera pose tracking in recovering the pitch and yaw components of general six-DoF motion, we introduce IMU measurements as motion priors via pre-integration. To this end, a compact back-end is proposed for continuously updating the IMU bias and predicting the linear velocity, enabling an accurate motion prediction for camera pose tracking. The resulting system scales well with modern high-resolution event cameras and leads to better global positioning accuracy in large-scale outdoor environments. Extensive evaluations on five publicly available datasets featuring different resolutions and scenarios justify the superior performance of the proposed system against five state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09374v1-abstract-full').style.display = 'none'; document.getElementById('2410.09374v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08794">arXiv:2410.08794</a> <span> [<a href="https://arxiv.org/pdf/2410.08794">pdf</a>, <a href="https://arxiv.org/format/2410.08794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> M$^3$-Impute: Mask-guided Representation Learning for Missing Value Imputation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhongyi Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenghao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shuhan Zhong</a>, <a href="/search/cs?searchtype=author&query=Su%2C+W">Weifeng Su</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+S+-+G">S. -H. Gary Chan</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+C">Chul-Ho Lee</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+W">Weipeng Zhuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08794v1-abstract-short" style="display: inline;"> Missing values are a common problem that poses significant challenges to data analysis and machine learning. This problem necessitates the development of an effective imputation method to fill in the missing values accurately, thereby enhancing the overall quality and utility of the datasets. Existing imputation methods, however, fall short of explicitly considering the `missingness' information i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08794v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08794v1-abstract-full" style="display: none;"> Missing values are a common problem that poses significant challenges to data analysis and machine learning. This problem necessitates the development of an effective imputation method to fill in the missing values accurately, thereby enhancing the overall quality and utility of the datasets. Existing imputation methods, however, fall short of explicitly considering the `missingness' information in the data during the embedding initialization stage and modeling the entangled feature and sample correlations during the learning process, thus leading to inferior performance. We propose M$^3$-Impute, which aims to explicitly leverage the missingness information and such correlations with novel masking schemes. M$^3$-Impute first models the data as a bipartite graph and uses a graph neural network to learn node embeddings, where the refined embedding initialization process directly incorporates the missingness information. They are then optimized through M$^3$-Impute's novel feature correlation unit (FRU) and sample correlation unit (SRU) that effectively captures feature and sample correlations for imputation. Experiment results on 25 benchmark datasets under three different missingness settings show the effectiveness of M$^3$-Impute by achieving 20 best and 4 second-best MAE scores on average. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08794v1-abstract-full').style.display = 'none'; document.getElementById('2410.08794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05331">arXiv:2410.05331</a> <span> [<a href="https://arxiv.org/pdf/2410.05331">pdf</a>, <a href="https://arxiv.org/format/2410.05331">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Taylor Unswift: Secured Weight Release for Large Language Models via Taylor Expansion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanchu Wang</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+Y">Yu-Neng Chuang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruixiang Tang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shaochen Zhong</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiayi Yuan</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Hongye Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zirui Liu</a>, <a href="/search/cs?searchtype=author&query=Chaudhary%2C+V">Vipin Chaudhary</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shuai Xu</a>, <a href="/search/cs?searchtype=author&query=Caverlee%2C+J">James Caverlee</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xia Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05331v1-abstract-short" style="display: inline;"> Ensuring the security of released large language models (LLMs) poses a significant dilemma, as existing mechanisms either compromise ownership rights or raise data privacy concerns. To address this dilemma, we introduce TaylorMLP to protect the ownership of released LLMs and prevent their abuse. Specifically, TaylorMLP preserves the ownership of LLMs by transforming the weights of LLMs into parame… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05331v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05331v1-abstract-full" style="display: none;"> Ensuring the security of released large language models (LLMs) poses a significant dilemma, as existing mechanisms either compromise ownership rights or raise data privacy concerns. To address this dilemma, we introduce TaylorMLP to protect the ownership of released LLMs and prevent their abuse. Specifically, TaylorMLP preserves the ownership of LLMs by transforming the weights of LLMs into parameters of Taylor-series. Instead of releasing the original weights, developers can release the Taylor-series parameters with users, thereby ensuring the security of LLMs. Moreover, TaylorMLP can prevent abuse of LLMs by adjusting the generation speed. It can induce low-speed token generation for the protected LLMs by increasing the terms in the Taylor-series. This intentional delay helps LLM developers prevent potential large-scale unauthorized uses of their models. Empirical experiments across five datasets and three LLM architectures demonstrate that TaylorMLP induces over 4x increase in latency, producing the tokens precisely matched with original LLMs. Subsequent defensive experiments further confirm that TaylorMLP effectively prevents users from reconstructing the weight values based on downstream datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05331v1-abstract-full').style.display = 'none'; document.getElementById('2410.05331v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00486">arXiv:2410.00486</a> <span> [<a href="https://arxiv.org/pdf/2410.00486">pdf</a>, <a href="https://arxiv.org/format/2410.00486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> CaRtGS: Computational Alignment for Real-Time Gaussian Splatting SLAM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+D">Dapeng Feng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhiqiang Chen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yizhen Yin</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shipeng Zhong</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuhua Qi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongbo Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00486v2-abstract-short" style="display: inline;"> Simultaneous Localization and Mapping (SLAM) is pivotal in robotics, with photorealistic scene reconstruction emerging as a key challenge. To address this, we introduce Computational Alignment for Real-Time Gaussian Splatting SLAM (CaRtGS), a novel method enhancing the efficiency and quality of photorealistic scene reconstruction in real-time environments. Leveraging 3D Gaussian Splatting (3DGS),… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00486v2-abstract-full').style.display = 'inline'; document.getElementById('2410.00486v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00486v2-abstract-full" style="display: none;"> Simultaneous Localization and Mapping (SLAM) is pivotal in robotics, with photorealistic scene reconstruction emerging as a key challenge. To address this, we introduce Computational Alignment for Real-Time Gaussian Splatting SLAM (CaRtGS), a novel method enhancing the efficiency and quality of photorealistic scene reconstruction in real-time environments. Leveraging 3D Gaussian Splatting (3DGS), CaRtGS achieves superior rendering quality and processing speed, which is crucial for scene photorealistic reconstruction. Our approach tackles computational misalignment in Gaussian Splatting SLAM (GS-SLAM) through an adaptive strategy that optimizes training, addresses long-tail optimization, and refines densification. Experiments on Replica and TUM-RGBD datasets demonstrate CaRtGS's effectiveness in achieving high-fidelity rendering with fewer Gaussian primitives. This work propels SLAM towards real-time, photorealistic dense rendering, significantly advancing photorealistic scene representation. For the benefit of the research community, we release the code on our project website: https://dapengfeng.github.io/cartgs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00486v2-abstract-full').style.display = 'none'; document.getElementById('2410.00486v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Upon a thorough internal review, we have identified that our manuscript lacks proper citation for a critical expression within the methodology section. In this revised version, we add Taming-3DGS as a citation in the splat-wise backpropagation statement</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18897">arXiv:2409.18897</a> <span> [<a href="https://arxiv.org/pdf/2409.18897">pdf</a>, <a href="https://arxiv.org/format/2409.18897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Detecting Dataset Abuse in Fine-Tuning Stable Diffusion Models for Text-to-Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Songrui Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yubo Zhu</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+W">Wei Tong</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18897v1-abstract-short" style="display: inline;"> Text-to-image synthesis has become highly popular for generating realistic and stylized images, often requiring fine-tuning generative models with domain-specific datasets for specialized tasks. However, these valuable datasets face risks of unauthorized usage and unapproved sharing, compromising the rights of the owners. In this paper, we address the issue of dataset abuse during the fine-tuning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18897v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18897v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18897v1-abstract-full" style="display: none;"> Text-to-image synthesis has become highly popular for generating realistic and stylized images, often requiring fine-tuning generative models with domain-specific datasets for specialized tasks. However, these valuable datasets face risks of unauthorized usage and unapproved sharing, compromising the rights of the owners. In this paper, we address the issue of dataset abuse during the fine-tuning of Stable Diffusion models for text-to-image synthesis. We present a dataset watermarking framework designed to detect unauthorized usage and trace data leaks. The framework employs two key strategies across multiple watermarking schemes and is effective for large-scale dataset authorization. Extensive experiments demonstrate the framework's effectiveness, minimal impact on the dataset (only 2% of the data required to be modified for high detection accuracy), and ability to trace data leaks. Our results also highlight the robustness and transferability of the framework, proving its practical applicability in detecting dataset abuse. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18897v1-abstract-full').style.display = 'none'; document.getElementById('2409.18897v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18601">arXiv:2409.18601</a> <span> [<a href="https://arxiv.org/pdf/2409.18601">pdf</a>, <a href="https://arxiv.org/format/2409.18601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Privacy-Preserving Quantum Annealing for Quadratic Unconstrained Binary Optimization (QUBO) Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+M">Moyang Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18601v1-abstract-short" style="display: inline;"> Quantum annealers offer a promising approach to solve Quadratic Unconstrained Binary Optimization (QUBO) problems, which have a wide range of applications. However, when a user submits its QUBO problem to a third-party quantum annealer, the problem itself may disclose the user's private information to the quantum annealing service provider. To mitigate this risk, we introduce a privacy-preserving… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18601v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18601v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18601v1-abstract-full" style="display: none;"> Quantum annealers offer a promising approach to solve Quadratic Unconstrained Binary Optimization (QUBO) problems, which have a wide range of applications. However, when a user submits its QUBO problem to a third-party quantum annealer, the problem itself may disclose the user's private information to the quantum annealing service provider. To mitigate this risk, we introduce a privacy-preserving QUBO framework and propose a novel solution method. Our approach employs a combination of digit-wise splitting and matrix permutation to obfuscate the QUBO problem's model matrix $Q$, effectively concealing the matrix elements. In addition, based on the solution to the obfuscated version of the QUBO problem, we can reconstruct the solution to the original problem with high accuracy. Theoretical analysis and empirical tests confirm the efficacy and efficiency of our proposed technique, demonstrating its potential for preserving user privacy in quantum annealing services. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18601v1-abstract-full').style.display = 'none'; document.getElementById('2409.18601v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures, QCE 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03179">arXiv:2409.03179</a> <span> [<a href="https://arxiv.org/pdf/2409.03179">pdf</a>, <a href="https://arxiv.org/format/2409.03179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681512">10.1145/3664647.3681512 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Perceptual-Distortion Balanced Image Super-Resolution is a Multi-Objective Optimization Problem </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qiwen Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanjie Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+S">Shilv Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liqun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiahuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L">Luxin Yan</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+X">Xu Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03179v1-abstract-short" style="display: inline;"> Training Single-Image Super-Resolution (SISR) models using pixel-based regression losses can achieve high distortion metrics scores (e.g., PSNR and SSIM), but often results in blurry images due to insufficient recovery of high-frequency details. Conversely, using GAN or perceptual losses can produce sharp images with high perceptual metric scores (e.g., LPIPS), but may introduce artifacts and inco… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03179v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03179v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03179v1-abstract-full" style="display: none;"> Training Single-Image Super-Resolution (SISR) models using pixel-based regression losses can achieve high distortion metrics scores (e.g., PSNR and SSIM), but often results in blurry images due to insufficient recovery of high-frequency details. Conversely, using GAN or perceptual losses can produce sharp images with high perceptual metric scores (e.g., LPIPS), but may introduce artifacts and incorrect textures. Balancing these two types of losses can help achieve a trade-off between distortion and perception, but the challenge lies in tuning the loss function weights. To address this issue, we propose a novel method that incorporates Multi-Objective Optimization (MOO) into the training process of SISR models to balance perceptual quality and distortion. We conceptualize the relationship between loss weights and image quality assessment (IQA) metrics as black-box objective functions to be optimized within our Multi-Objective Bayesian Optimization Super-Resolution (MOBOSR) framework. This approach automates the hyperparameter tuning process, reduces overall computational cost, and enables the use of numerous loss functions simultaneously. Extensive experiments demonstrate that MOBOSR outperforms state-of-the-art methods in terms of both perceptual quality and distortion, significantly advancing the perception-distortion Pareto frontier. Our work points towards a new direction for future research on balancing perceptual quality and fidelity in nearly all image restoration tasks. The source code and pretrained models are available at: https://github.com/ZhuKeven/MOBOSR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03179v1-abstract-full').style.display = 'none'; document.getElementById('2409.03179v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02495">arXiv:2409.02495</a> <span> [<a href="https://arxiv.org/pdf/2409.02495">pdf</a>, <a href="https://arxiv.org/format/2409.02495">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CoAst: Validation-Free Contribution Assessment for Federated Learning based on Cross-Round Valuation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Likun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shucheng Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fengyuan Xu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02495v1-abstract-short" style="display: inline;"> In the federated learning (FL) process, since the data held by each participant is different, it is necessary to figure out which participant has a higher contribution to the model performance. Effective contribution assessment can help motivate data owners to participate in the FL training. Research works in this field can be divided into two directions based on whether a validation dataset is re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02495v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02495v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02495v1-abstract-full" style="display: none;"> In the federated learning (FL) process, since the data held by each participant is different, it is necessary to figure out which participant has a higher contribution to the model performance. Effective contribution assessment can help motivate data owners to participate in the FL training. Research works in this field can be divided into two directions based on whether a validation dataset is required. Validation-based methods need to use representative validation data to measure the model accuracy, which is difficult to obtain in practical FL scenarios. Existing validation-free methods assess the contribution based on the parameters and gradients of local models and the global model in a single training round, which is easily compromised by the stochasticity of model training. In this work, we propose CoAst, a practical method to assess the FL participants' contribution without access to any validation data. The core idea of CoAst involves two aspects: one is to only count the most important part of model parameters through a weights quantization, and the other is a cross-round valuation based on the similarity between the current local parameters and the global parameter updates in several subsequent communication rounds. Extensive experiments show that CoAst has comparable assessment reliability to existing validation-based methods and outperforms existing validation-free methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02495v1-abstract-full').style.display = 'none'; document.getElementById('2409.02495v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10822">arXiv:2408.10822</a> <span> [<a href="https://arxiv.org/pdf/2408.10822">pdf</a>, <a href="https://arxiv.org/format/2408.10822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Navigating Spatio-Temporal Heterogeneity: A Graph Transformer Approach for Traffic Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jianxiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+E">Erdong Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Siru Zhong</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10822v2-abstract-short" style="display: inline;"> Traffic forecasting has emerged as a crucial research area in the development of smart cities. Although various neural networks with intricate architectures have been developed to address this problem, they still face two key challenges: i) Recent advancements in network designs for modeling spatio-temporal correlations are starting to see diminishing returns in performance enhancements. ii) Addit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10822v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10822v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10822v2-abstract-full" style="display: none;"> Traffic forecasting has emerged as a crucial research area in the development of smart cities. Although various neural networks with intricate architectures have been developed to address this problem, they still face two key challenges: i) Recent advancements in network designs for modeling spatio-temporal correlations are starting to see diminishing returns in performance enhancements. ii) Additionally, most models do not account for the spatio-temporal heterogeneity inherent in traffic data, i.e., traffic distribution varies significantly across different regions and traffic flow patterns fluctuate across various time slots. To tackle these challenges, we introduce the Spatio-Temporal Graph Transformer (STGormer), which effectively integrates attribute and structure information inherent in traffic data for learning spatio-temporal correlations, and a mixture-of-experts module for capturing heterogeneity along spaital and temporal axes. Specifically, we design two straightforward yet effective spatial encoding methods based on the graph structure and integrate time position encoding into the vanilla transformer to capture spatio-temporal traffic patterns. Additionally, a mixture-of-experts enhanced feedforward neural network (FNN) module adaptively assigns suitable expert layers to distinct patterns via a spatio-temporal gating network, further improving overall prediction accuracy. Experiments on real-world traffic datasets demonstrate that STGormer achieves state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10822v2-abstract-full').style.display = 'none'; document.getElementById('2408.10822v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10450">arXiv:2408.10450</a> <span> [<a href="https://arxiv.org/pdf/2408.10450">pdf</a>, <a href="https://arxiv.org/format/2408.10450">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RUMI: Rummaging Using Mutual Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Fazeli%2C+N">Nima Fazeli</a>, <a href="/search/cs?searchtype=author&query=Berenson%2C+D">Dmitry Berenson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10450v1-abstract-short" style="display: inline;"> This paper presents Rummaging Using Mutual Information (RUMI), a method for online generation of robot action sequences to gather information about the pose of a known movable object in visually-occluded environments. Focusing on contact-rich rummaging, our approach leverages mutual information between the object pose distribution and robot trajectory for action planning. From an observed partial… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10450v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10450v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10450v1-abstract-full" style="display: none;"> This paper presents Rummaging Using Mutual Information (RUMI), a method for online generation of robot action sequences to gather information about the pose of a known movable object in visually-occluded environments. Focusing on contact-rich rummaging, our approach leverages mutual information between the object pose distribution and robot trajectory for action planning. From an observed partial point cloud, RUMI deduces the compatible object pose distribution and approximates the mutual information of it with workspace occupancy in real time. Based on this, we develop an information gain cost function and a reachability cost function to keep the object within the robot's reach. These are integrated into a model predictive control (MPC) framework with a stochastic dynamics model, updating the pose distribution in a closed loop. Key contributions include a new belief framework for object pose estimation, an efficient information gain computation strategy, and a robust MPC-based control scheme. RUMI demonstrates superior performance in both simulated and real tasks compared to baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10450v1-abstract-full').style.display = 'none'; document.getElementById('2408.10450v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 17 figures, submitted to IEEE Transactions on Robotics (T-RO)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.9 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10284">arXiv:2408.10284</a> <span> [<a href="https://arxiv.org/pdf/2408.10284">pdf</a>, <a href="https://arxiv.org/format/2408.10284">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3676536.3676741">10.1145/3676536.3676741 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> AdapMoE: Adaptive Sensitivity-based Expert Gating and Management for Efficient MoE Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shuzhang Zhong</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Ling Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Runsheng Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Ru Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Meng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10284v1-abstract-short" style="display: inline;"> Mixture-of-Experts (MoE) models are designed to enhance the efficiency of large language models (LLMs) without proportionally increasing the computational demands. However, their deployment on edge devices still faces significant challenges due to high on-demand loading overheads from managing sparsely activated experts. This paper introduces AdapMoE, an algorithm-system co-design framework for ef… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10284v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10284v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10284v1-abstract-full" style="display: none;"> Mixture-of-Experts (MoE) models are designed to enhance the efficiency of large language models (LLMs) without proportionally increasing the computational demands. However, their deployment on edge devices still faces significant challenges due to high on-demand loading overheads from managing sparsely activated experts. This paper introduces AdapMoE, an algorithm-system co-design framework for efficient MoE inference. AdapMoE features adaptive expert gating and management to reduce the on-demand loading overheads. We observe the heterogeneity of experts loading across layers and tokens, based on which we propose a sensitivity-based strategy to adjust the number of activated experts dynamically. Meanwhile, we also integrate advanced prefetching and cache management techniques to further reduce the loading latency. Through comprehensive evaluations on various platforms, we demonstrate AdapMoE consistently outperforms existing techniques, reducing the average number of activated experts by 25% and achieving a 1.35x speedup without accuracy degradation. Code is available at: https://github.com/PKU-SEC-Lab/AdapMoE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10284v1-abstract-full').style.display = 'none'; document.getElementById('2408.10284v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08323">arXiv:2408.08323</a> <span> [<a href="https://arxiv.org/pdf/2408.08323">pdf</a>, <a href="https://arxiv.org/format/2408.08323">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Exploring Urban Comfort through Novel Wearables and Environmental Surveys </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chwalek%2C+P">Patrick Chwalek</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sailin Zhong</a>, <a href="/search/cs?searchtype=author&query=Perry%2C+N">Nathan Perry</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianqi Liu</a>, <a href="/search/cs?searchtype=author&query=Miller%2C+C">Clayton Miller</a>, <a href="/search/cs?searchtype=author&query=Alavi%2C+H+S">Hamed Seiied Alavi</a>, <a href="/search/cs?searchtype=author&query=Lalanne%2C+D">Denis Lalanne</a>, <a href="/search/cs?searchtype=author&query=Paradiso%2C+J+A">Joseph A. Paradiso</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08323v1-abstract-short" style="display: inline;"> This study presents a comprehensive dataset capturing indoor environmental parameters, physiological responses, and subjective perceptions across three global cities. Utilizing wearable sensors, including smart eyeglasses, and a modified Cozie app, environmental and physiological data were collected, along with pre-screening, onboarding, and recurring surveys. Peripheral cues facilitated participa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08323v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08323v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08323v1-abstract-full" style="display: none;"> This study presents a comprehensive dataset capturing indoor environmental parameters, physiological responses, and subjective perceptions across three global cities. Utilizing wearable sensors, including smart eyeglasses, and a modified Cozie app, environmental and physiological data were collected, along with pre-screening, onboarding, and recurring surveys. Peripheral cues facilitated participant engagement with micro-EMA surveys, minimizing disruption over a 5-day collection period. The dataset offers insights into urban comfort dynamics, highlighting the interplay between environmental conditions, physiological responses, and subjective perceptions. Researchers can utilize this dataset to deepen their understanding of indoor environmental quality and inform the design of healthier built environments. Access to this dataset can advance indoor environmental research and contribute to the creation of more comfortable and sustainable indoor spaces. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08323v1-abstract-full').style.display = 'none'; document.getElementById('2408.08323v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Nature Scientific Data</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05944">arXiv:2408.05944</a> <span> [<a href="https://arxiv.org/pdf/2408.05944">pdf</a>, <a href="https://arxiv.org/format/2408.05944">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty Quantification of Spectral Estimator and MLE for Orthogonal Group Synchronization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+Z+S">Ziliang Samuel Zhong</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+S">Shuyang Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05944v1-abstract-short" style="display: inline;"> Orthogonal group synchronization aims to recover orthogonal group elements from their noisy pairwise measurements. It has found numerous applications including computer vision, imaging science, and community detection. Due to the orthogonal constraints, it is often challenging to find the least squares estimator in presence of noise. In the recent years, semidefinite relaxation (SDR) and spectral… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05944v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05944v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05944v1-abstract-full" style="display: none;"> Orthogonal group synchronization aims to recover orthogonal group elements from their noisy pairwise measurements. It has found numerous applications including computer vision, imaging science, and community detection. Due to the orthogonal constraints, it is often challenging to find the least squares estimator in presence of noise. In the recent years, semidefinite relaxation (SDR) and spectral methods have proven to be powerful tools in recovering the group elements. In particular, under additive Gaussian noise, the SDR exactly produces the maximum likelihood estimator (MLE), and both MLE and spectral methods are able to achieve near-optimal statistical error. In this work, we take one step further to quantify the uncertainty of the MLE and spectral estimators by considering their distributions. By leveraging the orthogonality constraints in the likelihood function, we obtain a second-order expansion of the MLE and spectral estimator with the leading terms as an anti-symmetric Gaussian random matrix that is on the tangent space of the orthogonal matrix. This also implies state-of-the-art min-max risk bounds as a by-product. Our works provide a general theoretical framework that is potentially useful to find an approximate distribution of the estimators arising from many statistical inference problems with manifold constraints. The numerical experiments confirm our theoretical contribution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05944v1-abstract-full').style.display = 'none'; document.getElementById('2408.05944v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03511">arXiv:2408.03511</a> <span> [<a href="https://arxiv.org/pdf/2408.03511">pdf</a>, <a href="https://arxiv.org/format/2408.03511">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MoExtend: Tuning New Experts for Modality and Task Extension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shanshan Zhong</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shanghua Gao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhongzhan Huang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+W">Wushao Wen</a>, <a href="/search/cs?searchtype=author&query=Zitnik%2C+M">Marinka Zitnik</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Pan Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03511v1-abstract-short" style="display: inline;"> Large language models (LLMs) excel in various tasks but are primarily trained on text data, limiting their application scope. Expanding LLM capabilities to include vision-language understanding is vital, yet training them on multimodal data from scratch is challenging and costly. Existing instruction tuning methods, e.g., LLAVA, often connects a pretrained CLIP vision encoder and LLMs via fully fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03511v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03511v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03511v1-abstract-full" style="display: none;"> Large language models (LLMs) excel in various tasks but are primarily trained on text data, limiting their application scope. Expanding LLM capabilities to include vision-language understanding is vital, yet training them on multimodal data from scratch is challenging and costly. Existing instruction tuning methods, e.g., LLAVA, often connects a pretrained CLIP vision encoder and LLMs via fully fine-tuning LLMs to bridge the modality gap. However, full fine-tuning is plagued by catastrophic forgetting, i.e., forgetting previous knowledge, and high training costs particularly in the era of increasing tasks and modalities. To solve this issue, we introduce MoExtend, an effective framework designed to streamline the modality adaptation and extension of Mixture-of-Experts (MoE) models. MoExtend seamlessly integrates new experts into pre-trained MoE models, endowing them with novel knowledge without the need to tune pretrained models such as MoE and vision encoders. This approach enables rapid adaptation and extension to new modal data or tasks, effectively addressing the challenge of accommodating new modalities within LLMs. Furthermore, MoExtend avoids tuning pretrained models, thus mitigating the risk of catastrophic forgetting. Experimental results demonstrate the efficacy and efficiency of MoExtend in enhancing the multimodal capabilities of LLMs, contributing to advancements in multimodal AI research. Code: https://github.com/zhongshsh/MoExtend. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03511v1-abstract-full').style.display = 'none'; document.getElementById('2408.03511v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 - SRW</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20508">arXiv:2407.20508</a> <span> [<a href="https://arxiv.org/pdf/2407.20508">pdf</a>, <a href="https://arxiv.org/format/2407.20508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Unveiling the Potential of Spiking Dynamics in Graph Representation Learning through Spatial-Temporal Normalization and Coding Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mingkun Xu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Huifeng Yin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yujie Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Faqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+J">Jing Pei</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shuai Zhong</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+L">Lei Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20508v1-abstract-short" style="display: inline;"> In recent years, spiking neural networks (SNNs) have attracted substantial interest due to their potential to replicate the energy-efficient and event-driven processing of biological neurons. Despite this, the application of SNNs in graph representation learning, particularly for non-Euclidean data, remains underexplored, and the influence of spiking dynamics on graph learning is not yet fully und… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20508v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20508v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20508v1-abstract-full" style="display: none;"> In recent years, spiking neural networks (SNNs) have attracted substantial interest due to their potential to replicate the energy-efficient and event-driven processing of biological neurons. Despite this, the application of SNNs in graph representation learning, particularly for non-Euclidean data, remains underexplored, and the influence of spiking dynamics on graph learning is not yet fully understood. This work seeks to address these gaps by examining the unique properties and benefits of spiking dynamics in enhancing graph representation learning. We propose a spike-based graph neural network model that incorporates spiking dynamics, enhanced by a novel spatial-temporal feature normalization (STFN) technique, to improve training efficiency and model stability. Our detailed analysis explores the impact of rate coding and temporal coding on SNN performance, offering new insights into their advantages for deep graph networks and addressing challenges such as the oversmoothing problem. Experimental results demonstrate that our SNN models can achieve competitive performance with state-of-the-art graph neural networks (GNNs) while considerably reducing computational costs, highlighting the potential of SNNs for efficient neuromorphic computing applications in complex graph-based scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20508v1-abstract-full').style.display = 'none'; document.getElementById('2407.20508v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19828">arXiv:2407.19828</a> <span> [<a href="https://arxiv.org/pdf/2407.19828">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Federated Learning based Latent Factorization of Tensors for Privacy-Preserving QoS Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shuai Zhong</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zengtong Tang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Di Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19828v1-abstract-short" style="display: inline;"> In applications related to big data and service computing, dynamic connections tend to be encountered, especially the dynamic data of user-perspective quality of service (QoS) in Web services. They are transformed into high-dimensional and incomplete (HDI) tensors which include abundant temporal pattern information. Latent factorization of tensors (LFT) is an extremely efficient and typical approa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19828v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19828v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19828v1-abstract-full" style="display: none;"> In applications related to big data and service computing, dynamic connections tend to be encountered, especially the dynamic data of user-perspective quality of service (QoS) in Web services. They are transformed into high-dimensional and incomplete (HDI) tensors which include abundant temporal pattern information. Latent factorization of tensors (LFT) is an extremely efficient and typical approach for extracting such patterns from an HDI tensor. However, current LFT models require the QoS data to be maintained in a central place (e.g., a central server), which is impossible for increasingly privacy-sensitive users. To address this problem, this article creatively designs a federated learning based on latent factorization of tensors (FL-LFT). It builds a data-density -oriented federated learning model to enable isolated users to collaboratively train a global LFT model while protecting user's privacy. Extensive experiments on a QoS dataset collected from the real world verify that FL-LFT shows a remarkable increase in prediction accuracy when compared to state-of-the-art federated learning (FL) approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19828v1-abstract-full').style.display = 'none'; document.getElementById('2407.19828v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12827">arXiv:2407.12827</a> <span> [<a href="https://arxiv.org/pdf/2407.12827">pdf</a>, <a href="https://arxiv.org/format/2407.12827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Solution for The PST-KDD-2024 OAG-Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shupeng Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinger Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shushan Jin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12827v1-abstract-short" style="display: inline;"> In this paper, we introduce the second-place solution in the KDD-2024 OAG-Challenge paper source tracing track. Our solution is mainly based on two methods, BERT and GCN, and combines the reasoning results of BERT and GCN in the final submission to achieve complementary performance. In the BERT solution, we focus on processing the fragments that appear in the references of the paper, and use a var… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12827v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12827v1-abstract-full" style="display: none;"> In this paper, we introduce the second-place solution in the KDD-2024 OAG-Challenge paper source tracing track. Our solution is mainly based on two methods, BERT and GCN, and combines the reasoning results of BERT and GCN in the final submission to achieve complementary performance. In the BERT solution, we focus on processing the fragments that appear in the references of the paper, and use a variety of operations to reduce the redundant interference in the fragments, so that the information received by BERT is more refined. In the GCN solution, we map information such as paper fragments, abstracts, and titles to a high-dimensional semantic space through an embedding model, and try to build edges between titles, abstracts, and fragments to integrate contextual relationships for judgment. In the end, our solution achieved a remarkable score of 0.47691 in the competition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12827v1-abstract-full').style.display = 'none'; document.getElementById('2407.12827v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12341">arXiv:2407.12341</a> <span> [<a href="https://arxiv.org/pdf/2407.12341">pdf</a>, <a href="https://arxiv.org/format/2407.12341">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> LLM-based query paraphrasing for video search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiaxin Wu</a>, <a href="/search/cs?searchtype=author&query=Ngo%2C+C">Chong-Wah Ngo</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+W">Wing-Kwong Chan</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng-Hua Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12341v1-abstract-short" style="display: inline;"> Text-to-video retrieval answers user queries through search by concepts and embeddings. Limited by the size of the concept bank and the amount of training data, answering queries in the wild is not always effective due to the out-of-vocabulary problem. Furthermore, neither concept-based nor embedding-based search can perform reasoning to consolidate the search results for complex queries mixed wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12341v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12341v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12341v1-abstract-full" style="display: none;"> Text-to-video retrieval answers user queries through search by concepts and embeddings. Limited by the size of the concept bank and the amount of training data, answering queries in the wild is not always effective due to the out-of-vocabulary problem. Furthermore, neither concept-based nor embedding-based search can perform reasoning to consolidate the search results for complex queries mixed with logical and spatial constraints. To address these problems, we leverage large language models (LLM) to paraphrase the query by text-to-text (T2T), text-to-image (T2I), and image-to-text (I2T) transformations. These transformations rephrase abstract concepts into simple words to address the out-of-vocabulary problem. Furthermore, the complex relationship in a query can be decoupled into simpler sub-queries, yielding better retrieval performance when fusing the search results of these sub-queries. To address the LLM hallucination problem, this paper also proposes a novel consistency-based verification strategy to filter the paraphrased queries that are factually incorrect. Extensive experiments are conducted for ad-hoc video search and known-item search on the TRECVid datasets. We provide empirical insights into how traditionally difficult-to-answer queries can be resolved by query paraphrasing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12341v1-abstract-full').style.display = 'none'; document.getElementById('2407.12341v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11377">arXiv:2407.11377</a> <span> [<a href="https://arxiv.org/pdf/2407.11377">pdf</a>, <a href="https://arxiv.org/format/2407.11377">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Environment-Aware Robotic Arm Reaching Based on a Bio-Inspired Neurodynamical Computational Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chatziparaschis%2C+D">Dimitrios Chatziparaschis</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shan Zhong</a>, <a href="/search/cs?searchtype=author&query=Christopoulos%2C+V">Vasileios Christopoulos</a>, <a href="/search/cs?searchtype=author&query=Karydis%2C+K">Konstantinos Karydis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11377v1-abstract-short" style="display: inline;"> Bio-inspired robotic systems are capable of adaptive learning, scalable control, and efficient information processing. Enabling real-time decision-making for such systems is critical to respond to dynamic changes in the environment. We focus on dynamic target tracking in open areas using a robotic six-degree-of-freedom manipulator with a bird-eye view camera for visual feedback, and by deploying t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11377v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11377v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11377v1-abstract-full" style="display: none;"> Bio-inspired robotic systems are capable of adaptive learning, scalable control, and efficient information processing. Enabling real-time decision-making for such systems is critical to respond to dynamic changes in the environment. We focus on dynamic target tracking in open areas using a robotic six-degree-of-freedom manipulator with a bird-eye view camera for visual feedback, and by deploying the Neurodynamical Computational Framework (NeuCF). NeuCF is a recently developed bio-inspired model for target tracking based on Dynamic Neural Fields (DNFs) and Stochastic Optimal Control (SOC) theory. It has been trained for reaching actions on a planar surface toward localized visual beacons, and it can re-target or generate stop signals on the fly based on changes in the environment (e.g., a new target has emerged, or an existing one has been removed). We evaluated our system over various target-reaching scenarios. In all experiments, NeuCF had high end-effector positional accuracy, generated smooth trajectories, and provided reduced path lengths compared with a baseline cubic polynomial trajectory generator. In all, the developed system offers a robust and dynamic-aware robotic manipulation approach that affords real-time decision-making. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11377v1-abstract-full').style.display = 'none'; document.getElementById('2407.11377v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 6 figures, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10226">arXiv:2407.10226</a> <span> [<a href="https://arxiv.org/pdf/2407.10226">pdf</a>, <a href="https://arxiv.org/format/2407.10226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Addressing Domain Discrepancy: A Dual-branch Collaborative Model to Unsupervised Dehazing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+S">Shuaibin Fan</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+M">Minglong Xue</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+A">Aoxiang Ning</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Senming Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10226v1-abstract-short" style="display: inline;"> Although synthetic data can alleviate acquisition challenges in image dehazing tasks, it also introduces the problem of domain bias when dealing with small-scale data. This paper proposes a novel dual-branch collaborative unpaired dehazing model (DCM-dehaze) to address this issue. The proposed method consists of two collaborative branches: dehazing and contour constraints. Specifically, we design… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10226v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10226v1-abstract-full" style="display: none;"> Although synthetic data can alleviate acquisition challenges in image dehazing tasks, it also introduces the problem of domain bias when dealing with small-scale data. This paper proposes a novel dual-branch collaborative unpaired dehazing model (DCM-dehaze) to address this issue. The proposed method consists of two collaborative branches: dehazing and contour constraints. Specifically, we design a dual depthwise separable convolutional module (DDSCM) to enhance the information expressiveness of deeper features and the correlation to shallow features. In addition, we construct a bidirectional contour function to optimize the edge features of the image to enhance the clarity and fidelity of the image details. Furthermore, we present feature enhancers via a residual dense architecture to eliminate redundant features of the dehazing process and further alleviate the domain deviation problem. Extensive experiments on benchmark datasets show that our method reaches the state-of-the-art. This project code will be available at \url{https://github.com/Fan-pixel/DCM-dehaze. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10226v1-abstract-full').style.display = 'none'; document.getElementById('2407.10226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05411">arXiv:2407.05411</a> <span> [<a href="https://arxiv.org/pdf/2407.05411">pdf</a>, <a href="https://arxiv.org/format/2407.05411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Assessing Code Generation with Intermediate Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+X">Xun Deng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sicheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Honghua Dong</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingyu Hu</a>, <a href="/search/cs?searchtype=author&query=Beillahi%2C+S+M">Sidi Mohamed Beillahi</a>, <a href="/search/cs?searchtype=author&query=Si%2C+X">Xujie Si</a>, <a href="/search/cs?searchtype=author&query=Long%2C+F">Fan Long</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05411v1-abstract-short" style="display: inline;"> Intermediate step methodologies like chain of thoughts (COT) have demonstrated effectiveness in enhancing the performance of Large Language Models (LLMs) on code generation. This study explores the utilization of intermediate languages, including various programming languages, natural language solutions, and pseudo-code, and systematically evaluates their impact on the performance of LLMs in code… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05411v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05411v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05411v1-abstract-full" style="display: none;"> Intermediate step methodologies like chain of thoughts (COT) have demonstrated effectiveness in enhancing the performance of Large Language Models (LLMs) on code generation. This study explores the utilization of intermediate languages, including various programming languages, natural language solutions, and pseudo-code, and systematically evaluates their impact on the performance of LLMs in code generation tasks. Our experiments encompass eleven models across the CodeLlama, GPT, and Mistral families, as well as newly released smaller models. Our findings reveal that intermediate languages generally exhibit greater efficacy in larger models that have not yet achieved state-of-the-art performance. Natural language consistently emerges as the most effective intermediate representation across all target languages. However, we observe no universally effective intermediate formal language across different models and target languages. Furthermore, we uncover a weak correlation between the correctness of intermediate solutions and final generation, suggesting that improvements may stem from the chain-of-thought effect rather than language-specific transfer. Interestingly, we discover that for GPT family models, prompting multiple times without explicit self-correction instructions yields performance gains across the studied languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05411v1-abstract-full').style.display = 'none'; document.getElementById('2407.05411v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01527">arXiv:2407.01527</a> <span> [<a href="https://arxiv.org/pdf/2407.01527">pdf</a>, <a href="https://arxiv.org/format/2407.01527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KV Cache Compression, But What Must We Give in Return? A Comprehensive Benchmark of Long Context Capable Approaches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiayi Yuan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongyi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shaochen Zhong</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+Y">Yu-Neng Chuang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Songchen Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanchu Wang</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duy Le</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Hongye Jin</a>, <a href="/search/cs?searchtype=author&query=Chaudhary%2C+V">Vipin Chaudhary</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhaozhuo Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zirui Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xia Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01527v2-abstract-short" style="display: inline;"> Long context capability is a crucial competency for large language models (LLMs) as it mitigates the human struggle to digest long-form texts. This capability enables complex task-solving scenarios such as book summarization, code assistance, and many more tasks that are traditionally manpower-intensive. However, transformer-based LLMs face significant challenges with long context input due to the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01527v2-abstract-full').style.display = 'inline'; document.getElementById('2407.01527v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01527v2-abstract-full" style="display: none;"> Long context capability is a crucial competency for large language models (LLMs) as it mitigates the human struggle to digest long-form texts. This capability enables complex task-solving scenarios such as book summarization, code assistance, and many more tasks that are traditionally manpower-intensive. However, transformer-based LLMs face significant challenges with long context input due to the growing size of the KV cache and the intrinsic complexity of attending to extended inputs; where multiple schools of efficiency-driven approaches - such as KV cache quantization, token dropping, prompt compression, linear-time sequence models, and hybrid architectures - have been proposed to produce efficient yet long context-capable models. Despite these advancements, no existing work has comprehensively benchmarked these methods in a reasonably aligned environment. In this work, we fill this gap by providing a taxonomy of current methods and evaluating 10+ state-of-the-art approaches across seven categories of long context tasks. Our work reveals numerous previously unknown phenomena and offers insights - as well as a friendly workbench - for the future development of long context-capable LLMs. The source code is available at https://github.com/henryzhongsc/longctx_bench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01527v2-abstract-full').style.display = 'none'; document.getElementById('2407.01527v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19466">arXiv:2406.19466</a> <span> [<a href="https://arxiv.org/pdf/2406.19466">pdf</a>, <a href="https://arxiv.org/format/2406.19466">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3658644.3670298">10.1145/3658644.3670298 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Data Poisoning Attacks to Locally Differentially Private Frequent Itemset Mining Protocols </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tong%2C+W">Wei Tong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+J">Jiacheng Niu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19466v1-abstract-short" style="display: inline;"> Local differential privacy (LDP) provides a way for an untrusted data collector to aggregate users' data without violating their privacy. Various privacy-preserving data analysis tasks have been studied under the protection of LDP, such as frequency estimation, frequent itemset mining, and machine learning. Despite its privacy-preserving properties, recent research has demonstrated the vulnerabili… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19466v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19466v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19466v1-abstract-full" style="display: none;"> Local differential privacy (LDP) provides a way for an untrusted data collector to aggregate users' data without violating their privacy. Various privacy-preserving data analysis tasks have been studied under the protection of LDP, such as frequency estimation, frequent itemset mining, and machine learning. Despite its privacy-preserving properties, recent research has demonstrated the vulnerability of certain LDP protocols to data poisoning attacks. However, existing data poisoning attacks are focused on basic statistics under LDP, such as frequency estimation and mean/variance estimation. As an important data analysis task, the security of LDP frequent itemset mining has yet to be thoroughly examined. In this paper, we aim to address this issue by presenting novel and practical data poisoning attacks against LDP frequent itemset mining protocols. By introducing a unified attack framework with composable attack operations, our data poisoning attack can successfully manipulate the state-of-the-art LDP frequent itemset mining protocols and has the potential to be adapted to other protocols with similar structures. We conduct extensive experiments on three datasets to compare the proposed attack with four baseline attacks. The results demonstrate the severity of the threat and the effectiveness of the proposed attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19466v1-abstract-full').style.display = 'none'; document.getElementById('2406.19466v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ACM Conference on Computer and Communications Security (ACM CCS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16307">arXiv:2406.16307</a> <span> [<a href="https://arxiv.org/pdf/2406.16307">pdf</a>, <a href="https://arxiv.org/format/2406.16307">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Artistic-style text detector and a new Movie-Poster dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ning%2C+A">Aoxiang Ning</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yiting Wei</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+M">Minglong Xue</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Senming Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16307v1-abstract-short" style="display: inline;"> Although current text detection algorithms demonstrate effectiveness in general scenarios, their performance declines when confronted with artistic-style text featuring complex structures. This paper proposes a method that utilizes Criss-Cross Attention and residual dense block to address the incomplete and misdiagnosis of artistic-style text detection by current algorithms. Specifically, our meth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16307v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16307v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16307v1-abstract-full" style="display: none;"> Although current text detection algorithms demonstrate effectiveness in general scenarios, their performance declines when confronted with artistic-style text featuring complex structures. This paper proposes a method that utilizes Criss-Cross Attention and residual dense block to address the incomplete and misdiagnosis of artistic-style text detection by current algorithms. Specifically, our method mainly consists of a feature extraction backbone, a feature enhancement network, a multi-scale feature fusion module, and a boundary discrimination module. The feature enhancement network significantly enhances the model's perceptual capabilities in complex environments by fusing horizontal and vertical contextual information, allowing it to capture detailed features overlooked in artistic-style text. We incorporate residual dense block into the Feature Pyramid Network to suppress the effect of background noise during feature fusion. Aiming to omit the complex post-processing, we explore a boundary discrimination module that guides the correct generation of boundary proposals. Furthermore, given that movie poster titles often use stylized art fonts, we collected a Movie-Poster dataset to address the scarcity of artistic-style text data. Extensive experiments demonstrate that our proposed method performs superiorly on the Movie-Poster dataset and produces excellent results on multiple benchmark datasets. The code and the Movie-Poster dataset will be available at: https://github.com/biedaxiaohua/Artistic-style-text-detection <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16307v1-abstract-full').style.display = 'none'; document.getElementById('2406.16307v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16062">arXiv:2406.16062</a> <span> [<a href="https://arxiv.org/pdf/2406.16062">pdf</a>, <a href="https://arxiv.org/format/2406.16062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Towards Biologically Plausible Computing: A Comprehensive Comparison </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+C">Changze Lv</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yufei Gu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhengkang Guo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhibo Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yixin Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Feiran Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenghua Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+R">Ruicheng Yin</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+Y">Yu Shang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Siqi Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaohua Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Muling Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenhao Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianlong Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianhao Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Cenyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zixuan Ling</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiaoqing Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16062v1-abstract-short" style="display: inline;"> Backpropagation is a cornerstone algorithm in training neural networks for supervised learning, which uses a gradient descent method to update network weights by minimizing the discrepancy between actual and desired outputs. Despite its pivotal role in propelling deep learning advancements, the biological plausibility of backpropagation is questioned due to its requirements for weight symmetry, gl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16062v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16062v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16062v1-abstract-full" style="display: none;"> Backpropagation is a cornerstone algorithm in training neural networks for supervised learning, which uses a gradient descent method to update network weights by minimizing the discrepancy between actual and desired outputs. Despite its pivotal role in propelling deep learning advancements, the biological plausibility of backpropagation is questioned due to its requirements for weight symmetry, global error computation, and dual-phase training. To address this long-standing challenge, many studies have endeavored to devise biologically plausible training algorithms. However, a fully biologically plausible algorithm for training multilayer neural networks remains elusive, and interpretations of biological plausibility vary among researchers. In this study, we establish criteria for biological plausibility that a desirable learning algorithm should meet. Using these criteria, we evaluate a range of existing algorithms considered to be biologically plausible, including Hebbian learning, spike-timing-dependent plasticity, feedback alignment, target propagation, predictive coding, forward-forward algorithm, perturbation learning, local losses, and energy-based learning. Additionally, we empirically evaluate these algorithms across diverse network architectures and datasets. We compare the feature representations learned by these algorithms with brain activity recorded by non-invasive devices under identical stimuli, aiming to identify which algorithm can most accurately replicate brain activity patterns. We are hopeful that this study could inspire the development of new biologically plausible algorithms for training multilayer networks, thereby fostering progress in both the fields of neuroscience and machine learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16062v1-abstract-full').style.display = 'none'; document.getElementById('2406.16062v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14318">arXiv:2406.14318</a> <span> [<a href="https://arxiv.org/pdf/2406.14318">pdf</a>, <a href="https://arxiv.org/format/2406.14318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Fire Thief Is Also the Keeper: Balancing Usability and Privacy in Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhili Shen</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+Z">Zihang Xi</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Ying He</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+W">Wei Tong</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+J">Jingyu Hua</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14318v1-abstract-short" style="display: inline;"> The rapid adoption of online chatbots represents a significant advancement in artificial intelligence. However, this convenience brings considerable privacy concerns, as prompts can inadvertently contain sensitive information exposed to large language models (LLMs). Limited by high computational costs, reduced task usability, and excessive system modifications, previous works based on local deploy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14318v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14318v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14318v1-abstract-full" style="display: none;"> The rapid adoption of online chatbots represents a significant advancement in artificial intelligence. However, this convenience brings considerable privacy concerns, as prompts can inadvertently contain sensitive information exposed to large language models (LLMs). Limited by high computational costs, reduced task usability, and excessive system modifications, previous works based on local deployment, embedding perturbation, and homomorphic encryption are inapplicable to online prompt-based LLM applications. To address these issues, this paper introduces Prompt Privacy Sanitizer (i.e., ProSan), an end-to-end prompt privacy protection framework that can produce anonymized prompts with contextual privacy removed while maintaining task usability and human readability. It can also be seamlessly integrated into the online LLM service pipeline. To achieve high usability and dynamic anonymity, ProSan flexibly adjusts its protection targets and strength based on the importance of the words and the privacy leakage risk of the prompts. Additionally, ProSan is capable of adapting to diverse computational resource conditions, ensuring privacy protection even for mobile devices with limited computing power. Our experiments demonstrate that ProSan effectively removes private information across various tasks, including question answering, text summarization, and code generation, with minimal reduction in task performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14318v1-abstract-full').style.display = 'none'; document.getElementById('2406.14318v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14106">arXiv:2406.14106</a> <span> [<a href="https://arxiv.org/pdf/2406.14106">pdf</a>, <a href="https://arxiv.org/format/2406.14106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EasyECR: A Library for Easy Implementation and Evaluation of Event Coreference Resolution Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuncong Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianhua Xu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng-hua Zhong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haiqin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14106v1-abstract-short" style="display: inline;"> Event Coreference Resolution (ECR) is the task of clustering event mentions that refer to the same real-world event. Despite significant advancements, ECR research faces two main challenges: limited generalizability across domains due to narrow dataset evaluations, and difficulties in comparing models within diverse ECR pipelines. To address these issues, we develop EasyECR, the first open-source… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14106v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14106v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14106v1-abstract-full" style="display: none;"> Event Coreference Resolution (ECR) is the task of clustering event mentions that refer to the same real-world event. Despite significant advancements, ECR research faces two main challenges: limited generalizability across domains due to narrow dataset evaluations, and difficulties in comparing models within diverse ECR pipelines. To address these issues, we develop EasyECR, the first open-source library designed to standardize data structures and abstract ECR pipelines for easy implementation and fair evaluation. More specifically, EasyECR integrates seven representative pipelines and ten popular benchmark datasets, enabling model evaluations on various datasets and promoting the development of robust ECR pipelines. By conducting extensive evaluation via our EasyECR, we find that, \lowercase\expandafter{\romannumeral1}) the representative ECR pipelines cannot generalize across multiple datasets, hence evaluating ECR pipelines on multiple datasets is necessary, \lowercase\expandafter{\romannumeral2}) all models in ECR pipelines have a great effect on pipeline performance, therefore, when one model in ECR pipelines are compared, it is essential to ensure that the other models remain consistent. Additionally, reproducing ECR results is not trivial, and the developed library can help reduce this discrepancy. The experimental results provide valuable baselines for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14106v1-abstract-full').style.display = 'none'; document.getElementById('2406.14106v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 4 figures, 12 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10590">arXiv:2406.10590</a> <span> [<a href="https://arxiv.org/pdf/2406.10590">pdf</a>, <a href="https://arxiv.org/format/2406.10590">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> LLM-Mediated Domain-Specific Voice Agents: The Case of TextileBot </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shu Zhong</a>, <a href="/search/cs?searchtype=author&query=Gatti%2C+E">Elia Gatti</a>, <a href="/search/cs?searchtype=author&query=Hardwick%2C+J">James Hardwick</a>, <a href="/search/cs?searchtype=author&query=Ribul%2C+M">Miriam Ribul</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+Y">Youngjun Cho</a>, <a href="/search/cs?searchtype=author&query=Obrist%2C+M">Marianna Obrist</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10590v1-abstract-short" style="display: inline;"> Developing domain-specific conversational agents (CAs) has been challenged by the need for extensive domain-focused data. Recent advancements in Large Language Models (LLMs) make them a viable option as a knowledge backbone. LLMs behaviour can be enhanced through prompting, instructing them to perform downstream tasks in a zero-shot fashion (i.e. without training). To this end, we incorporated str… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10590v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10590v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10590v1-abstract-full" style="display: none;"> Developing domain-specific conversational agents (CAs) has been challenged by the need for extensive domain-focused data. Recent advancements in Large Language Models (LLMs) make them a viable option as a knowledge backbone. LLMs behaviour can be enhanced through prompting, instructing them to perform downstream tasks in a zero-shot fashion (i.e. without training). To this end, we incorporated structural knowledge into prompts and used prompted LLMs to build domain-specific voice-based CAs. We demonstrate this approach for the specific domain of textile circularity in form of the design, development, and evaluation of TextileBot. We present the design and development of the voice agent TextileBot and also the insights from an in-person user study (N=30) evaluating three variations of TextileBots. We analyse the human-agent interactions, combining quantitative and qualitative methods. Our results suggest that participants engaged in multi-turn conversations, and their perceptions of the three variation agents and respective interactions varied demonstrating the effectiveness of our prompt-based LLM approach. We discuss the dynamics of these interactions and their implications for designing future voice-based CAs. The results show that our method's potential for building domain-specific CAs. Furthermore, most participants engaged in multi-turn conversations, and their perceptions of the three voice agents and respective interactions varied demonstrating the effectiveness of our prompt-based LLM approach. We discuss the dynamics of these interactions and their implications for designing future voice-based CAs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10590v1-abstract-full').style.display = 'none'; document.getElementById('2406.10590v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06587">arXiv:2406.06587</a> <span> [<a href="https://arxiv.org/pdf/2406.06587">pdf</a>, <a href="https://arxiv.org/format/2406.06587">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Exploring Human-AI Perception Alignment in Sensory Experiences: Do LLMs Understand Textile Hand? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shu Zhong</a>, <a href="/search/cs?searchtype=author&query=Gatti%2C+E">Elia Gatti</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+Y">Youngjun Cho</a>, <a href="/search/cs?searchtype=author&query=Obrist%2C+M">Marianna Obrist</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06587v1-abstract-short" style="display: inline;"> Aligning large language models (LLMs) behaviour with human intent is critical for future AI. An important yet often overlooked aspect of this alignment is the perceptual alignment. Perceptual modalities like touch are more multifaceted and nuanced compared to other sensory modalities such as vision. This work investigates how well LLMs align with human touch experiences using the "textile hand" ta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06587v1-abstract-full').style.display = 'inline'; document.getElementById('2406.06587v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06587v1-abstract-full" style="display: none;"> Aligning large language models (LLMs) behaviour with human intent is critical for future AI. An important yet often overlooked aspect of this alignment is the perceptual alignment. Perceptual modalities like touch are more multifaceted and nuanced compared to other sensory modalities such as vision. This work investigates how well LLMs align with human touch experiences using the "textile hand" task. We created a "Guess What Textile" interaction in which participants were given two textile samples -- a target and a reference -- to handle. Without seeing them, participants described the differences between them to the LLM. Using these descriptions, the LLM attempted to identify the target textile by assessing similarity within its high-dimensional embedding space. Our results suggest that a degree of perceptual alignment exists, however varies significantly among different textile samples. For example, LLM predictions are well aligned for silk satin, but not for cotton denim. Moreover, participants didn't perceive their textile experiences closely matched by the LLM predictions. This is only the first exploration into perceptual alignment around touch, exemplified through textile hand. We discuss possible sources of this alignment variance, and how better human-AI perceptual alignment can benefit future everyday tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06587v1-abstract-full').style.display = 'none'; document.getElementById('2406.06587v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19718">arXiv:2405.19718</a> <span> [<a href="https://arxiv.org/pdf/2405.19718">pdf</a>, <a href="https://arxiv.org/format/2405.19718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LED: A Large-scale Real-world Paired Dataset for Event Camera Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+Y">Yuxing Duan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Shihan Peng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lin Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi Chang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L">Luxin Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19718v1-abstract-short" style="display: inline;"> Event camera has significant advantages in capturing dynamic scene information while being prone to noise interference, particularly in challenging conditions like low threshold and low illumination. However, most existing research focuses on gentle situations, hindering event camera applications in realistic complex scenarios. To tackle this limitation and advance the field, we construct a new pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19718v1-abstract-full').style.display = 'inline'; document.getElementById('2405.19718v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19718v1-abstract-full" style="display: none;"> Event camera has significant advantages in capturing dynamic scene information while being prone to noise interference, particularly in challenging conditions like low threshold and low illumination. However, most existing research focuses on gentle situations, hindering event camera applications in realistic complex scenarios. To tackle this limitation and advance the field, we construct a new paired real-world event denoising dataset (LED), including 3K sequences with 18K seconds of high-resolution (1200*680) event streams and showing three notable distinctions compared to others: diverse noise levels and scenes, larger-scale with high-resolution, and high-quality GT. Specifically, it contains stepped parameters and varying illumination with diverse scenarios. Moreover, based on the property of noise events inconsistency and signal events consistency, we propose a novel effective denoising framework(DED) using homogeneous dual events to generate the GT with better separating noise from the raw. Furthermore, we design a bio-inspired baseline leveraging Leaky-Integrate-and-Fire (LIF) neurons with dynamic thresholds to realize accurate denoising. The experimental results demonstrate that the remarkable performance of the proposed approach on different datasets.The dataset and code are at https://github.com/Yee-Sing/led. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19718v1-abstract-full').style.display = 'none'; document.getElementById('2405.19718v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18910">arXiv:2405.18910</a> <span> [<a href="https://arxiv.org/pdf/2405.18910">pdf</a>, <a href="https://arxiv.org/format/2405.18910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Predicting Parking Availability in Singapore with Cross-Domain Data: A New Dataset and A Data-Driven Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huaiwu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yutong Xia</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Siru Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kun Wang</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+Z">Zekun Tong</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Q">Qingsong Wen</a>, <a href="/search/cs?searchtype=author&query=Zimmermann%2C+R">Roger Zimmermann</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18910v1-abstract-short" style="display: inline;"> The increasing number of vehicles highlights the need for efficient parking space management. Predicting real-time Parking Availability (PA) can help mitigate traffic congestion and the corresponding social problems, which is a pressing issue in densely populated cities like Singapore. In this study, we aim to collectively predict future PA across Singapore with complex factors from various domain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18910v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18910v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18910v1-abstract-full" style="display: none;"> The increasing number of vehicles highlights the need for efficient parking space management. Predicting real-time Parking Availability (PA) can help mitigate traffic congestion and the corresponding social problems, which is a pressing issue in densely populated cities like Singapore. In this study, we aim to collectively predict future PA across Singapore with complex factors from various domains. The contributions in this paper are listed as follows: (1) A New Dataset: We introduce the \texttt{SINPA} dataset, containing a year's worth of PA data from 1,687 parking lots in Singapore, enriched with various spatial and temporal factors. (2) A Data-Driven Approach: We present DeepPA, a novel deep-learning framework, to collectively and efficiently predict future PA across thousands of parking lots. (3) Extensive Experiments and Deployment: DeepPA demonstrates a 9.2% reduction in prediction error for up to 3-hour forecasts compared to existing advanced models. Furthermore, we implement DeepPA in a practical web-based platform to provide real-time PA predictions to aid drivers and inform urban planning for the governors in Singapore. We release the dataset and source code at https://github.com/yoshall/SINPA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18910v1-abstract-full').style.display = 'none'; document.getElementById('2405.18910v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IJCAI 2024 (Multi-Year Track On AI And Social Good with ~20% acceptance rate)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17503">arXiv:2405.17503</a> <span> [<a href="https://arxiv.org/pdf/2405.17503">pdf</a>, <a href="https://arxiv.org/format/2405.17503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> Code Repair with LLMs gives an Exploration-Exploitation Tradeoff </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+H">Hao Tang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Keya Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J+P">Jin Peng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sicheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wei-Long Zheng</a>, <a href="/search/cs?searchtype=author&query=Si%2C+X">Xujie Si</a>, <a href="/search/cs?searchtype=author&query=Ellis%2C+K">Kevin Ellis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17503v3-abstract-short" style="display: inline;"> Iteratively improving and repairing source code with large language models (LLMs), known as refinement, has emerged as a popular way of generating programs that would be too complex to construct in one shot. Given a bank of test cases, together with a candidate program, an LLM can improve that program by being prompted with failed test cases. But it remains an open question how to best iteratively… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17503v3-abstract-full').style.display = 'inline'; document.getElementById('2405.17503v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17503v3-abstract-full" style="display: none;"> Iteratively improving and repairing source code with large language models (LLMs), known as refinement, has emerged as a popular way of generating programs that would be too complex to construct in one shot. Given a bank of test cases, together with a candidate program, an LLM can improve that program by being prompted with failed test cases. But it remains an open question how to best iteratively refine code, with prior work employing simple greedy or breadth-first strategies. We show here that refinement exposes an explore-exploit tradeoff: exploit by refining the program that passes the most test cases, or explore by refining a lesser considered program. We frame this as an arm-acquiring bandit problem, which we solve with Thompson Sampling. The resulting LLM-based program synthesis algorithm is broadly applicable: Across loop invariant synthesis, visual reasoning puzzles, and competition programming problems, we find that our new method can solve more problems using fewer language model calls. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17503v3-abstract-full').style.display = 'none'; document.getElementById('2405.17503v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17042">arXiv:2405.17042</a> <span> [<a href="https://arxiv.org/pdf/2405.17042">pdf</a>, <a href="https://arxiv.org/format/2405.17042">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> LabObf: A Label Protection Scheme for Vertical Federated Learning Through Label Obfuscation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Ying He</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+M">Mingyang Niu</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+J">Jingyu Hua</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yunlong Mao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xu Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17042v2-abstract-short" style="display: inline;"> Split Neural Network, as one of the most common architectures used in vertical federated learning, is popular in industry due to its privacy-preserving characteristics. In this architecture, the party holding the labels seeks cooperation from other parties to improve model performance due to insufficient feature data. Each of these participants has a self-defined bottom model to learn hidden repre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17042v2-abstract-full').style.display = 'inline'; document.getElementById('2405.17042v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17042v2-abstract-full" style="display: none;"> Split Neural Network, as one of the most common architectures used in vertical federated learning, is popular in industry due to its privacy-preserving characteristics. In this architecture, the party holding the labels seeks cooperation from other parties to improve model performance due to insufficient feature data. Each of these participants has a self-defined bottom model to learn hidden representations from its own feature data and uploads the embedding vectors to the top model held by the label holder for final predictions. This design allows participants to conduct joint training without directly exchanging data. However, existing research points out that malicious participants may still infer label information from the uploaded embeddings, leading to privacy leakage. In this paper, we first propose an embedding extension attack manipulating embeddings to undermine existing defense strategies, which rely on constraining the correlation between the embeddings uploaded by participants and the labels. Subsequently, we propose a new label obfuscation defense strategy, called `LabObf', which randomly maps each original integer-valued label to multiple real-valued soft labels with values intertwined, significantly increasing the difficulty for attackers to infer the labels. We conduct experiments on four different types of datasets, and the results show that LabObf significantly reduces the attacker's success rate compared to raw models while maintaining desirable model accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17042v2-abstract-full').style.display = 'none'; document.getElementById('2405.17042v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04071">arXiv:2405.04071</a> <span> [<a href="https://arxiv.org/pdf/2405.04071">pdf</a>, <a href="https://arxiv.org/format/2405.04071">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IMU-Aided Event-based Stereo Visual Odometry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+J">Junkai Niu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04071v1-abstract-short" style="display: inline;"> Direct methods for event-based visual odometry solve the mapping and camera pose tracking sub-problems by establishing implicit data association in a way that the generative model of events is exploited. The main bottlenecks faced by state-of-the-art work in this field include the high computational complexity of mapping and the limited accuracy of tracking. In this paper, we improve our previous… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04071v1-abstract-full').style.display = 'inline'; document.getElementById('2405.04071v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04071v1-abstract-full" style="display: none;"> Direct methods for event-based visual odometry solve the mapping and camera pose tracking sub-problems by establishing implicit data association in a way that the generative model of events is exploited. The main bottlenecks faced by state-of-the-art work in this field include the high computational complexity of mapping and the limited accuracy of tracking. In this paper, we improve our previous direct pipeline \textit{Event-based Stereo Visual Odometry} in terms of accuracy and efficiency. To speed up the mapping operation, we propose an efficient strategy of edge-pixel sampling according to the local dynamics of events. The mapping performance in terms of completeness and local smoothness is also improved by combining the temporal stereo results and the static stereo results. To circumvent the degeneracy issue of camera pose tracking in recovering the yaw component of general 6-DoF motion, we introduce as a prior the gyroscope measurements via pre-integration. Experiments on publicly available datasets justify our improvement. We release our pipeline as an open-source software for future research in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04071v1-abstract-full').style.display = 'none'; document.getElementById('2405.04071v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 7 figures, ICRA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00739">arXiv:2405.00739</a> <span> [<a href="https://arxiv.org/pdf/2405.00739">pdf</a>, <a href="https://arxiv.org/format/2405.00739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Why does Knowledge Distillation Work? Rethink its Attention and Fidelity Mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+C">Chenqi Guo</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Shiwei Zhong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Q">Qianli Feng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yinglong Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00739v1-abstract-short" style="display: inline;"> Does Knowledge Distillation (KD) really work? Conventional wisdom viewed it as a knowledge transfer procedure where a perfect mimicry of the student to its teacher is desired. However, paradoxical studies indicate that closely replicating the teacher's behavior does not consistently improve student generalization, posing questions on its possible causes. Confronted with this gap, we hypothesize th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00739v1-abstract-full').style.display = 'inline'; document.getElementById('2405.00739v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00739v1-abstract-full" style="display: none;"> Does Knowledge Distillation (KD) really work? Conventional wisdom viewed it as a knowledge transfer procedure where a perfect mimicry of the student to its teacher is desired. However, paradoxical studies indicate that closely replicating the teacher's behavior does not consistently improve student generalization, posing questions on its possible causes. Confronted with this gap, we hypothesize that diverse attentions in teachers contribute to better student generalization at the expense of reduced fidelity in ensemble KD setups. By increasing data augmentation strengths, our key findings reveal a decrease in the Intersection over Union (IoU) of attentions between teacher models, leading to reduced student overfitting and decreased fidelity. We propose this low-fidelity phenomenon as an underlying characteristic rather than a pathology when training KD. This suggests that stronger data augmentation fosters a broader perspective provided by the divergent teacher ensemble and lower student-teacher mutual information, benefiting generalization performance. These insights clarify the mechanism on low-fidelity phenomenon in KD. Thus, we offer new perspectives on optimizing student model performance, by emphasizing increased diversity in teacher attentions and reduced mimicry behavior between teachers and student. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00739v1-abstract-full').style.display = 'none'; document.getElementById('2405.00739v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14241">arXiv:2404.14241</a> <span> [<a href="https://arxiv.org/pdf/2404.14241">pdf</a>, <a href="https://arxiv.org/format/2404.14241">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UrbanCross: Enhancing Satellite Image-Text Retrieval with Cross-Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Siru Zhong</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xixuan Hao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yibo Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14241v1-abstract-short" style="display: inline;"> Urbanization challenges underscore the necessity for effective satellite image-text retrieval methods to swiftly access specific information enriched with geographic semantics for urban applications. However, existing methods often overlook significant domain gaps across diverse urban landscapes, primarily focusing on enhancing retrieval performance within single domains. To tackle this issue, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14241v1-abstract-full').style.display = 'inline'; document.getElementById('2404.14241v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14241v1-abstract-full" style="display: none;"> Urbanization challenges underscore the necessity for effective satellite image-text retrieval methods to swiftly access specific information enriched with geographic semantics for urban applications. However, existing methods often overlook significant domain gaps across diverse urban landscapes, primarily focusing on enhancing retrieval performance within single domains. To tackle this issue, we present UrbanCross, a new framework for cross-domain satellite image-text retrieval. UrbanCross leverages a high-quality, cross-domain dataset enriched with extensive geo-tags from three countries to highlight domain diversity. It employs the Large Multimodal Model (LMM) for textual refinement and the Segment Anything Model (SAM) for visual augmentation, achieving a fine-grained alignment of images, segments and texts, yielding a 10% improvement in retrieval performance. Additionally, UrbanCross incorporates an adaptive curriculum-based source sampler and a weighted adversarial cross-domain fine-tuning module, progressively enhancing adaptability across various domains. Extensive experiments confirm UrbanCross's superior efficiency in retrieval and adaptation to new urban environments, demonstrating an average performance increase of 15% over its version without domain adaptation mechanisms, effectively bridging the domain gap. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14241v1-abstract-full').style.display = 'none'; document.getElementById('2404.14241v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhong%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository