Search | arXiv e-print repository
Showing 1–50 of 4,396 results for author: Wang, S
Searching in archive cs.
type="text" value="Wang, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15014">arXiv:2411.15014</a> <span> [<a href="https://arxiv.org/pdf/2411.15014">pdf</a>, <a href="https://arxiv.org/format/2411.15014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On the Linear Speedup of Personalized Federated Reinforcement Learning with Shared Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Guojun Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shufan Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+D">Daniel Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15014v1-abstract-short" style="display: inline;"> Federated reinforcement learning (FedRL) enables multiple agents to collaboratively learn a policy without sharing their local trajectories collected during agent-environment interactions. However, in practice, the environments faced by different agents are often heterogeneous, leading to poor performance by the single policy learned by existing FedRL algorithms on individual agents. 
In this paper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15014v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15014v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15014v1-abstract-full" style="display: none;"> Federated reinforcement learning (FedRL) enables multiple agents to collaboratively learn a policy without sharing their local trajectories collected during agent-environment interactions. However, in practice, the environments faced by different agents are often heterogeneous, leading to poor performance by the single policy learned by existing FedRL algorithms on individual agents. In this paper, we take a further step and introduce a \emph{personalized} FedRL framework (PFedRL) by taking advantage of possibly shared common structure among agents in heterogeneous environments. Specifically, we develop a class of PFedRL algorithms named PFedRL-Rep that learns (1) a shared feature representation collaboratively among all agents, and (2) an agent-specific weight vector personalized to its local environment. We analyze the convergence of PFedTD-Rep, a particular instance of the framework with temporal difference (TD) learning and linear representations. To the best of our knowledge, we are the first to prove a linear convergence speedup with respect to the number of agents in the PFedRL setting. To achieve this, we show that PFedTD-Rep is an example of the federated two-timescale stochastic approximation with Markovian noise. Experimental results demonstrate that PFedTD-Rep, along with an extension to the control setting based on deep Q-networks (DQN), not only improve learning in heterogeneous settings, but also provide better generalization to new environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15014v1-abstract-full').style.display = 'none'; document.getElementById('2411.15014v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
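To make the shared-representation structure described above concrete, the following is a minimal Python sketch of personalized federated TD(0) with a collaboratively averaged representation matrix and per-agent weight vectors. It is an illustration only, not the authors' PFedTD-Rep implementation; the random environments, feature dimensions, step sizes, and synchronization period are all assumptions.

    import numpy as np

    rng = np.random.default_rng(0)
    n_agents, n_states, d, k = 4, 20, 8, 3      # agents, states, base-feature dim, representation dim
    gamma, alpha_w, alpha_B, sync_every = 0.9, 0.02, 0.002, 50

    phi = rng.normal(size=(n_states, d)) / np.sqrt(d)      # fixed base features phi(s)
    # Heterogeneous environments: each agent gets its own Markov reward process.
    P = [rng.dirichlet(np.ones(n_states), size=n_states) for _ in range(n_agents)]
    r = [rng.normal(size=n_states) for _ in range(n_agents)]

    B = rng.normal(scale=0.1, size=(d, k))                  # shared representation
    B_local = [B.copy() for _ in range(n_agents)]
    w = [rng.normal(scale=0.1, size=k) for _ in range(n_agents)]  # personalized heads
    s = [0] * n_agents

    for t in range(1, 3001):
        for i in range(n_agents):
            s_next = rng.choice(n_states, p=P[i][s[i]])
            v = phi[s[i]] @ B_local[i] @ w[i]
            v_next = phi[s_next] @ B_local[i] @ w[i]
            delta = r[i][s[i]] + gamma * v_next - v          # TD error
            # Faster timescale: personalized weights; slower timescale: representation.
            w[i] += alpha_w * delta * (B_local[i].T @ phi[s[i]])
            B_local[i] += alpha_B * delta * np.outer(phi[s[i]], w[i])
            s[i] = s_next
        if t % sync_every == 0:                              # federated averaging of B only
            B = np.mean(B_local, axis=0)
            B_local = [B.copy() for _ in range(n_agents)]

    print("shared representation norm:", round(float(np.linalg.norm(B)), 3))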
2. arXiv:2411.14790 [pdf, other] (cs.CL, cs.AI)
Title: KBAda: Efficient Self Adaptation on Specific Knowledge Bases
Authors: Zheni Zeng, Yuxuan Chen, Shi Yu, Yukun Yan, Zhenghao Liu, Shuo Wang, Xu Han, Zhiyuan Liu, Maosong Sun
Abstract: Humans can utilize techniques to quickly acquire knowledge from specific materials in advance, such as creating self-assessment questions, enabling us to achieve related tasks more efficiently. In contrast, large language models (LLMs) usually rely on retrieval-augmented generation to exploit knowledge materials in an instant manner, or require external signals such as human preference data and stronger LLM annotations to conduct knowledge adaptation. To unleash the self-learning potential of LLMs, we propose KBAda, an approach designed for efficient adaptation to downstream tasks involving knowledge bases. Our method utilizes iterative training with self-annotated data such as Q&A pairs and revision suggestions, enabling the model to grasp the knowledge content efficiently. Experimental results on multiple datasets demonstrate the effectiveness of our approach, significantly boosting model performance in downstream tasks that require specific knowledge at a low cost. Notably, our approach achieves over 90% of the performance improvement that can be obtained by using GPT-4-turbo annotation, while relying entirely on self-supervision. We release our experimental data, models, and process analyses to the community for further exploration (https://github.com/thunlp/KBAda).
Submitted 22 November, 2024; originally announced November 2024.

3. arXiv:2411.14560 [pdf] (cs.CV, cs.LG) DOI: 10.1145/3687123.3698290
Title: Enhancing GeoAI and location encoding with spatial point pattern statistics: A Case Study of Terrain Feature Classification
Authors: Sizhe Wang, Wenwen Li
Abstract: This study introduces a novel approach to terrain feature classification by incorporating spatial point pattern statistics into deep learning models. Inspired by the concept of location encoding, which aims to capture location characteristics to enhance GeoAI decision-making capabilities, we improve the GeoAI model by a knowledge-driven approach to integrate both first-order and second-order effects of point patterns. This paper investigates how these spatial contexts impact the accuracy of terrain feature predictions. The results show that incorporating spatial point pattern statistics notably enhances model performance by leveraging different representations of spatial relationships.
Submitted 21 November, 2024; originally announced November 2024.
Comments: 4 pages with 1 figure. Accepted in the 7th ACM SIGSPATIAL International Workshop on AI for Geographic Knowledge Discovery

4. arXiv:2411.14476 [pdf] (cs.CL, cs.AI, cs.CV)
Title: StreetviewLLM: Extracting Geographic Information Using a Chain-of-Thought Multimodal Large Language Model
Authors: Zongrong Li, Junhao Xu, Siqin Wang, Yifan Wu, Haiyang Li
Abstract: Geospatial predictions are crucial for diverse fields such as disaster management, urban planning, and public health. Traditional machine learning methods often face limitations when handling unstructured or multi-modal data like street view imagery. To address these challenges, we propose StreetViewLLM, a novel framework that integrates a large language model with chain-of-thought reasoning and multimodal data sources. By combining street view imagery with geographic coordinates and textual data, StreetViewLLM improves the precision and granularity of geospatial predictions. Using retrieval-augmented generation techniques, our approach enhances geographic information extraction, enabling a detailed analysis of urban environments. The model has been applied to seven global cities, including Hong Kong, Tokyo, Singapore, Los Angeles, New York, London, and Paris, demonstrating superior performance in predicting urban indicators, including population density, accessibility to healthcare, normalized difference vegetation index, building height, and impervious surface. The results show that StreetViewLLM consistently outperforms baseline models, offering improved predictive accuracy and deeper insights into the built environment. This research opens new opportunities for integrating large language models into urban analytics, decision-making in urban planning, infrastructure management, and environmental monitoring.
Submitted 19 November, 2024; originally announced November 2024.

5. arXiv:2411.14135 [pdf, other] (eess.IV, cs.MM)
Title: Compact Visual Data Representation for Green Multimedia -- A Human Visual System Perspective
Authors: Peilin Chen, Xiaohan Fang, Meng Wang, Shiqi Wang, Siwei Ma
Abstract: The Human Visual System (HVS), with its intricate sophistication, is capable of achieving ultra-compact information compression for visual signals. This remarkable ability is coupled with high generalization capability and energy efficiency. By contrast, the state-of-the-art Versatile Video Coding (VVC) standard achieves a compression ratio of around 1,000 times for raw visual data. This notable disparity motivates the research community to draw inspiration to effectively handle the immense volume of visual data in a green way. Therefore, this paper provides a survey of how visual data can be efficiently represented for green multimedia, in particular when the ultimate task is knowledge extraction instead of visual signal reconstruction. We introduce recent research efforts that promote green, sustainable, and efficient multimedia in this field. Moreover, we discuss how a deep understanding of the HVS can benefit the research community, and envision the development of future green multimedia technologies.
Submitted 21 November, 2024; originally announced November 2024.

6. arXiv:2411.13817 [pdf, other] (cs.DS)
Title: Dynamic Structural Clustering Unleashed: Flexible Similarities, Versatile Updates and for All Parameters
Authors: Zhuowei Zhao, Junhao Gan, Boyu Ruan, Zhifeng Bao, Jianzhong Qi, Sibo Wang
Abstract: We study structural clustering on graphs in dynamic scenarios, where the graphs can be updated by arbitrary insertions or deletions of edges/vertices. The goal is to efficiently compute structural clustering results for any clustering parameters $\varepsilon$ and $\mu$ given on the fly, for arbitrary graph update patterns, and for all typical similarity measurements. Specifically, we adopt the idea of update affordability and propose an a-lot-simpler yet more efficient (both theoretically and practically) algorithm than the state of the art, named VD-STAR, to handle graph updates. First, with a theoretical clustering result quality guarantee, VD-STAR can output high-quality clustering results with up to 99.9% accuracy. Second, our VD-STAR is easy to implement as it just needs to maintain certain sorted linked lists and hash tables, and hence, effectively enhances its deployment in practice. Third and most importantly, by careful analysis, VD-STAR improves the per-update time bound of the state of the art from $O(\log^2 n)$ expected with a certain update pattern assumption to $O(\log n)$ amortized in expectation without any update pattern assumption. We further design two variants of VD-STAR to enhance its empirical performance. Experimental results show that our algorithms consistently outperform the state-of-the-art competitors by up to 9,315 times in update time across nine real datasets.
Submitted 20 November, 2024; originally announced November 2024.
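As background for the entry above, the sketch below shows what the clustering parameters $\varepsilon$ and $\mu$ control in static SCAN-style structural clustering: adjacent vertices are compared with a structural similarity, a vertex with at least $\mu$ neighbors of similarity at least $\varepsilon$ is a core, and clusters grow from cores. It does not reproduce VD-STAR's dynamic index (the sorted linked lists and hash tables with $O(\log n)$ amortized updates); the similarity measure and toy graph are assumptions.

    from collections import deque

    def structural_clusters(adj, eps=0.5, mu=2):
        """adj: dict vertex -> set of neighbours (undirected). Returns {vertex: cluster id}."""
        def sim(u, v):                       # cosine similarity of closed neighbourhoods
            nu, nv = adj[u] | {u}, adj[v] | {v}
            return len(nu & nv) / (len(nu) * len(nv)) ** 0.5

        eps_nbrs = {u: {v for v in adj[u] if sim(u, v) >= eps} for u in adj}
        cores = {u for u in adj if len(eps_nbrs[u]) >= mu}   # mu controls core density

        labels, cid = {}, 0
        for seed in cores:                   # grow one cluster per unlabelled core
            if seed in labels:
                continue
            cid += 1
            queue = deque([seed])
            while queue:
                u = queue.popleft()
                if u in labels:
                    continue
                labels[u] = cid
                if u in cores:               # only core vertices expand the cluster
                    queue.extend(eps_nbrs[u] - set(labels))
        return labels

    # toy graph: two triangles joined by a single bridge edge
    adj = {1: {2, 3}, 2: {1, 3}, 3: {1, 2, 4}, 4: {3, 5, 6}, 5: {4, 6}, 6: {4, 5}}
    print(structural_clusters(adj, eps=0.6, mu=2))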
7. arXiv:2411.13797 [pdf, other] (cs.CV)
Title: Hugging Rain Man: A Novel Facial Action Units Dataset for Analyzing Atypical Facial Expressions in Children with Autism Spectrum Disorder
Authors: Yanfeng Ji, Shutong Wang, Ruyi Xu, Jingying Chen, Xinzhou Jiang, Zhengyu Deng, Yuxuan Quan, Junpeng Liu
Abstract: Children with Autism Spectrum Disorder (ASD) often exhibit atypical facial expressions. However, the specific objective facial features that underlie this subjective perception remain unclear. In this paper, we introduce a novel dataset, Hugging Rain Man (HRM), which includes facial action units (AUs) manually annotated by FACS experts for both children with ASD and typical development (TD). The dataset comprises a rich collection of posed and spontaneous facial expressions, totaling approximately 130,000 frames, along with 22 AUs, 10 Action Descriptors (ADs), and atypicality ratings. A statistical analysis of static images from the HRM reveals significant differences between the ASD and TD groups across multiple AUs and ADs when displaying the same emotional expressions, confirming that participants with ASD tend to demonstrate more irregular and diverse expression patterns. Subsequently, a temporal regression method is presented to analyze the atypicality of dynamic sequences, thereby bridging the gap between subjective perception and objective facial characteristics. Furthermore, baseline results for AU detection are provided for future research reference. This work not only contributes to our understanding of the unique facial expression characteristics associated with ASD but also provides potential tools for ASD early screening. Portions of the dataset, features, and pretrained models are accessible at: https://github.com/Jonas-DL/Hugging-Rain-Man.
Submitted 20 November, 2024; originally announced November 2024.
Comments: Portions of the dataset, features, and pretrained models are accessible at: https://github.com/Jonas-DL/Hugging-Rain-Man

8. arXiv:2411.13127 [pdf, other] (cs.CV)
Title: Adapting Vision Foundation Models for Robust Cloud Segmentation in Remote Sensing Images
Authors: Xuechao Zou, Shun Zhang, Kai Li, Shiying Wang, Junliang Xing, Lei Jin, Congyan Lang, Pin Tao
Abstract: Cloud segmentation is a critical challenge in remote sensing image interpretation, as its accuracy directly impacts the effectiveness of subsequent data processing and analysis. Recently, vision foundation models (VFM) have demonstrated powerful generalization capabilities across various visual tasks. In this paper, we present a parameter-efficient adaptive approach, termed Cloud-Adapter, designed to enhance the accuracy and robustness of cloud segmentation. Our method leverages a VFM pretrained on general domain data, which remains frozen, eliminating the need for additional training. Cloud-Adapter incorporates a lightweight spatial perception module that initially utilizes a convolutional neural network (ConvNet) to extract dense spatial representations. These multi-scale features are then aggregated and serve as contextual inputs to an adapting module, which modulates the frozen transformer layers within the VFM. Experimental results demonstrate that the Cloud-Adapter approach, utilizing only 0.6% of the trainable parameters of the frozen backbone, achieves substantial performance gains. Cloud-Adapter consistently attains state-of-the-art (SOTA) performance across a wide variety of cloud segmentation datasets from multiple satellite sources, sensor series, data processing levels, land cover scenarios, and annotation granularities. We have released the source code and pretrained models at https://github.com/XavierJiezou/Cloud-Adapter to support further research.
Submitted 20 November, 2024; originally announced November 2024.
Comments: 13 pages, 9 figures
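The general pattern described in the Cloud-Adapter abstract (frozen transformer backbone, lightweight ConvNet context, adapter that modulates the frozen features) can be sketched as below. This is a hypothetical PyTorch illustration, not the code released at https://github.com/XavierJiezou/Cloud-Adapter; the FiLM-style modulation, module sizes, and shapes are assumptions.

    import torch
    import torch.nn as nn

    class SpatialContext(nn.Module):
        """Small ConvNet that turns the input image into a per-image context vector."""
        def __init__(self, dim):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.GELU(),
                nn.Conv2d(32, dim, 3, stride=2, padding=1), nn.AdaptiveAvgPool2d(1))
        def forward(self, img):                      # (B, 3, H, W) -> (B, dim)
            return self.net(img).flatten(1)

    class Adapter(nn.Module):
        """FiLM-style modulation of frozen token features, conditioned on the context."""
        def __init__(self, dim):
            super().__init__()
            self.to_scale = nn.Linear(dim, dim)
            self.to_shift = nn.Linear(dim, dim)
        def forward(self, tokens, ctx):              # tokens: (B, N, dim), ctx: (B, dim)
            scale = self.to_scale(ctx).unsqueeze(1)
            shift = self.to_shift(ctx).unsqueeze(1)
            return tokens * (1 + scale) + shift

    dim, n_tokens = 128, 196
    frozen_block = nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
    for p in frozen_block.parameters():              # backbone stays frozen
        p.requires_grad_(False)

    context_net, adapter = SpatialContext(dim), Adapter(dim)
    img = torch.randn(2, 3, 224, 224)
    tokens = torch.randn(2, n_tokens, dim)           # stand-in for patch embeddings
    out = adapter(frozen_block(tokens), context_net(img))
    print(out.shape)                                 # torch.Size([2, 196, 128])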
9. arXiv:2411.12791 [pdf, other] (cs.CV, eess.IV)
Title: Mitigating Perception Bias: A Training-Free Approach to Enhance LMM for Image Quality Assessment
Authors: Siyi Pan, Baoliang Chen, Danni Huang, Hanwei Zhu, Lingyu Zhu, Xiangjie Sui, Shiqi Wang
Abstract: Despite the impressive performance of large multimodal models (LMMs) in high-level visual tasks, their capacity for image quality assessment (IQA) remains limited. One main reason is that LMMs are primarily trained for high-level tasks (e.g., image captioning), emphasizing unified image semantics extraction under varied quality. Such semantic-aware yet quality-insensitive perception bias inevitably leads to a heavy reliance on image semantics when those LMMs are forced to rate quality. In this paper, instead of costly retraining or tuning of an LMM, we propose a training-free debiasing framework, in which the image quality prediction is rectified by mitigating the bias caused by image semantics. Specifically, we first explore several semantic-preserving distortions that can significantly degrade image quality while maintaining identifiable semantics. By applying these specific distortions to the query or test images, we ensure that the degraded images are recognized as poor quality while their semantics remain. During quality inference, both a query image and its corresponding degraded version are fed to the LMM along with a prompt indicating that the query image quality should be inferred under the condition that the degraded one is deemed poor quality. This prior condition effectively aligns the LMM's quality perception, as all degraded images are consistently rated as poor quality, regardless of their semantic difference. Finally, the quality scores of the query image inferred under different prior conditions (degraded versions) are aggregated using a conditional probability model. Extensive experiments on various IQA datasets show that our debiasing framework consistently enhances LMM performance; the code will be publicly available.
Submitted 19 November, 2024; originally announced November 2024.
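The inference-time loop suggested by the abstract above can be outlined as follows. The lmm_rate function is a stub standing in for the actual LMM call, and the aggregation shown is a plain average of expected scores rather than the paper's conditional probability model; the distortion choices and quality levels are assumptions.

    import numpy as np

    rng = np.random.default_rng(0)
    LEVELS = np.array([1, 2, 3, 4, 5])            # discrete quality levels (assumption)

    def degrade(img, kind):
        """Placeholder for a semantic-preserving distortion (the kind tag is ignored here)."""
        return np.clip(img + rng.normal(scale=25.0, size=img.shape), 0, 255)

    def lmm_rate(query, degraded_reference):
        """Stub for the LMM call: the prompt would state that degraded_reference is poor
        quality and ask for the query's rating; returns p(level | this prior condition)."""
        return rng.dirichlet(np.ones(len(LEVELS)))

    def debiased_score(query, distortion_kinds=("blur", "noise", "jpeg")):
        scores = []
        for kind in distortion_kinds:             # one prior condition per distortion
            p = lmm_rate(query, degrade(query, kind))
            scores.append(float(LEVELS @ p))      # expected quality under this condition
        return float(np.mean(scores))             # aggregate across prior conditions

    print(debiased_score(rng.random((64, 64)) * 255))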
10. arXiv:2411.12767 [pdf, other] (cs.CL, cs.AI, cs.SI)
Title: Suicide Risk Assessment on Social Media with Semi-Supervised Learning
Authors: Max Lovitt, Haotian Ma, Song Wang, Yifan Peng
Abstract: With social media communities increasingly becoming places where suicidal individuals post and congregate, natural language processing presents an exciting avenue for the development of automated suicide risk assessment systems. However, past efforts suffer from a lack of labeled data and class imbalances within the available labeled data. To accommodate this task's imperfect data landscape, we propose a semi-supervised framework that leverages labeled (n=500) and unlabeled (n=1,500) data and expands upon the self-training algorithm with a novel pseudo-label acquisition process designed to handle imbalanced datasets. To further ensure pseudo-label quality, we manually verify a subset of the pseudo-labeled data that was not predicted unanimously across multiple trials of pseudo-label generation. We test various models to serve as the backbone for this framework, ultimately deciding that RoBERTa performs the best. Ultimately, by leveraging partially validated pseudo-labeled data in addition to ground-truth labeled data, we substantially improve our model's ability to assess suicide risk from social media posts.
Submitted 17 November, 2024; originally announced November 2024.
Comments: Accepted for publication in the 2024 IEEE International Conference on Big Data
11. arXiv:2411.12547 [pdf, other] (eess.IV, cs.CV, cs.LG)
Title: S3TU-Net: Structured Convolution and Superpixel Transformer for Lung Nodule Segmentation
Authors: Yuke Wu, Xiang Liu, Yunyu Shi, Xinyi Chen, Zhenglei Wang, YuQing Xu, Shuo Hong Wang
Abstract: The irregular and challenging characteristics of lung adenocarcinoma nodules in computed tomography (CT) images complicate staging diagnosis, making accurate segmentation critical for clinicians to extract detailed lesion information. In this study, we propose a segmentation model, S3TU-Net, which integrates multi-dimensional spatial connectors and a superpixel-based visual transformer. S3TU-Net is built on a multi-view CNN-Transformer hybrid architecture, incorporating superpixel algorithms, structured weighting, and spatial shifting techniques to achieve superior segmentation performance. The model leverages structured convolution blocks (DWF-Conv/D2BR-Conv) to extract multi-scale local features while mitigating overfitting. To enhance multi-scale feature fusion, we introduce the S2-MLP Link, integrating spatial shifting and attention mechanisms at the skip connections. Additionally, the residual-based superpixel visual transformer (RM-SViT) effectively merges global and local features by employing sparse correlation learning and multi-branch attention to capture long-range dependencies, with residual connections enhancing stability and computational efficiency. Experimental results on the LIDC-IDRI dataset demonstrate that S3TU-Net achieves a DSC, precision, and IoU of 89.04%, 90.73%, and 90.70%, respectively. Compared to recent methods, S3TU-Net improves DSC by 4.52% and sensitivity by 3.16%, with other metrics showing an approximate 2% increase. In addition to comparison and ablation studies, we validated the generalization ability of our model on the EPDB private dataset, achieving a DSC of 86.40%.
Submitted 19 November, 2024; originally announced November 2024.

12. arXiv:2411.12478 [pdf] (cs.RO, eess.SY)
Title: Robotic transcatheter tricuspid valve replacement with hybrid enhanced intelligence: a new paradigm and first-in-vivo study
Authors: Shuangyi Wang, Haichuan Lin, Yiping Xie, Ziqi Wang, Dong Chen, Longyue Tan, Xilong Hou, Chen Chen, Xiao-Hu Zhou, Shengtao Lin, Fei Pan, Kent Chak-Yu So, Zeng-Guang Hou
Abstract: Transcatheter tricuspid valve replacement (TTVR) is the latest treatment for tricuspid regurgitation and is in the early stages of clinical adoption. Intelligent robotic approaches are expected to overcome the challenges of surgical manipulation and widespread dissemination, but systems and protocols with high clinical utility have not yet been reported. In this study, we propose a complete solution that includes a passive stabilizer, robotic drive, detachable delivery catheter and valve manipulation mechanism. Working towards autonomy, a hybrid augmented intelligence approach based on reinforcement learning, Monte Carlo probabilistic maps and human-robot co-piloted control was introduced. Systematic tests in phantom and first-in-vivo animal experiments were performed to verify that the system design met the clinical requirements. Furthermore, the experimental results confirmed the advantages of co-piloted control over conventional master-slave control in terms of time efficiency, control efficiency, autonomy and stability of operation. In conclusion, this study provides a comprehensive pathway for robotic TTVR and, to our knowledge, completes the first animal study that not only successfully demonstrates the application of hybrid enhanced intelligence in interventional robotics, but also provides a solution with high application value for a cutting-edge procedure.
Submitted 19 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12448">arXiv:2411.12448</a> <span> [<a href="https://arxiv.org/pdf/2411.12448">pdf</a>, <a href="https://arxiv.org/format/2411.12448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models for Lossless Image Compression: Next-Pixel Prediction in Language Space is All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kecheng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pingping Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yibing Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiaxin Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Hong Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12448v2-abstract-short" style="display: inline;"> We have recently witnessed that ``Intelligence" and ``Compression" are the two sides of the same coin, where the large language model (LLM) with unprecedented intelligence is a general-purpose lossless compressor for various data modalities. This attribute particularly appeals to the lossless image compression community, given the increasing need to compress high-resolution images in the current… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12448v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12448v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12448v2-abstract-full" style="display: none;"> We have recently witnessed that ``Intelligence" and ``Compression" are the two sides of the same coin, where the large language model (LLM) with unprecedented intelligence is a general-purpose lossless compressor for various data modalities. This attribute particularly appeals to the lossless image compression community, given the increasing need to compress high-resolution images in the current streaming media era. Consequently, a natural question arises: Can the compression performance of the LLM elevate lossless image compression to new heights? However, our findings indicate that the naive application of LLM-based lossless image compressors suffers from a considerable performance gap compared with existing state-of-the-art (SOTA) codecs on common benchmark datasets. In light of this, we are dedicated to fulfilling the unprecedented intelligence (compression) capacity of the LLM for lossless image compression tasks, thereby bridging the gap between theoretical and practical compression performance. 
Specifically, we propose P$^{2}$-LLM, a next-pixel prediction-based LLM, which integrates various elaborated insights and methodologies, \textit{e.g.,} pixel-level priors, the in-context ability of LLM, and a pixel-level semantic preservation strategy, to enhance the understanding capacity of pixel sequences for better next-pixel predictions. Extensive experiments on benchmark datasets demonstrate that P$^{2}$-LLM can beat SOTA classical and learned codecs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12448v2-abstract-full').style.display = 'none'; document.getElementById('2411.12448v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12347">arXiv:2411.12347</a> <span> [<a href="https://arxiv.org/pdf/2411.12347">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Leveraging NFTs for Spectrum Securitization in 6G Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhixian Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chen Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peichang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12347v1-abstract-short" style="display: inline;"> Dynamic Spectrum Sharing can enhance spectrum resource utilization by promoting the dynamic distribution of spectrum resources. However, to effectively implement dynamic spectrum resource allocation, certain mechanisms are needed to incentivize primary users to proactively share their spectrum resources. This paper, based on the ERC404 standard and integrating Non-Fungible Token and Fungible Token… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12347v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12347v1-abstract-full" style="display: none;"> Dynamic Spectrum Sharing can enhance spectrum resource utilization by promoting the dynamic distribution of spectrum resources. However, to effectively implement dynamic spectrum resource allocation, certain mechanisms are needed to incentivize primary users to proactively share their spectrum resources. This paper, based on the ERC404 standard and integrating Non-Fungible Token and Fungible Token technologies, proposes a spectrum securitization model to incentivize spectrum resource sharing and implements it on the Ethereum test net. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12347v1-abstract-full').style.display = 'none'; document.getElementById('2411.12347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12195">arXiv:2411.12195</a> <span> [<a href="https://arxiv.org/pdf/2411.12195">pdf</a>, <a href="https://arxiv.org/format/2411.12195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Medical Vision-and-Language Applications and Their Techniques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Ruoshan Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sinuo Wang</a>, <a href="/search/cs?searchtype=author&query=Phan%2C+V+M+H">Vu Minh Hieu Phan</a>, <a href="/search/cs?searchtype=author&query=Hengel%2C+A+v+d">Anton van den Hengel</a>, <a href="/search/cs?searchtype=author&query=Verjans%2C+J">Johan Verjans</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Z">Zhibin Liao</a>, <a href="/search/cs?searchtype=author&query=To%2C+M">Minh-Son To</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yong Xia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jian Chen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yutong Xie</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12195v1-abstract-short" style="display: inline;"> Medical vision-and-language models (MVLMs) have attracted substantial interest due to their capability to offer a natural language interface for interpreting complex medical data. Their applications are versatile and have the potential to improve diagnostic accuracy and decision-making for individual patients while also contributing to enhanced public health monitoring, disease surveillance, and p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12195v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12195v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12195v1-abstract-full" style="display: none;"> Medical vision-and-language models (MVLMs) have attracted substantial interest due to their capability to offer a natural language interface for interpreting complex medical data. Their applications are versatile and have the potential to improve diagnostic accuracy and decision-making for individual patients while also contributing to enhanced public health monitoring, disease surveillance, and policy-making through more efficient analysis of large data sets. 
MVLMS integrate natural language processing with medical images to enable a more comprehensive and contextual understanding of medical images alongside their corresponding textual information. Unlike general vision-and-language models trained on diverse, non-specialized datasets, MVLMs are purpose-built for the medical domain, automatically extracting and interpreting critical information from medical images and textual reports to support clinical decision-making. Popular clinical applications of MVLMs include automated medical report generation, medical visual question answering, medical multimodal segmentation, diagnosis and prognosis and medical image-text retrieval. Here, we provide a comprehensive overview of MVLMs and the various medical tasks to which they have been applied. We conduct a detailed analysis of various vision-and-language model architectures, focusing on their distinct strategies for cross-modal integration/exploitation of medical visual and textual features. We also examine the datasets used for these tasks and compare the performance of different models based on standardized evaluation metrics. Furthermore, we highlight potential challenges and summarize future research trends and directions. The full collection of papers and codes is available at: https://github.com/YtongXie/Medical-Vision-and-Language-Tasks-and-Methodologies-A-Survey. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12195v1-abstract-full').style.display = 'none'; document.getElementById('2411.12195v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12183">arXiv:2411.12183</a> <span> [<a href="https://arxiv.org/pdf/2411.12183">pdf</a>, <a href="https://arxiv.org/format/2411.12183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Action-Attentive Deep Reinforcement Learning for Autonomous Alignment of Beamlines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siyu Wang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+S">Shengran Dai</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jianhui Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shuang Wu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yufei Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junbin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12183v1-abstract-short" style="display: inline;"> Synchrotron radiation sources play a crucial role in fields such as materials science, biology, and chemistry. The beamline, a key subsystem of the synchrotron, modulates and directs the radiation to the sample for analysis. 
However, the alignment of beamlines is a complex and time-consuming process, primarily carried out manually by experienced engineers. Even minor misalignments in optical compo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12183v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12183v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12183v1-abstract-full" style="display: none;"> Synchrotron radiation sources play a crucial role in fields such as materials science, biology, and chemistry. The beamline, a key subsystem of the synchrotron, modulates and directs the radiation to the sample for analysis. However, the alignment of beamlines is a complex and time-consuming process, primarily carried out manually by experienced engineers. Even minor misalignments in optical components can significantly affect the beam's properties, leading to suboptimal experimental outcomes. Current automated methods, such as Bayesian optimization (BO) and reinforcement learning (RL), enhance performance, but limitations remain. The relationship between the current and target beam properties, crucial for determining the adjustment, is not fully considered. Additionally, the physical characteristics of optical elements are overlooked, such as the need to adjust specific devices to control the output beam's spot size or position. This paper addresses the alignment of beamlines by modeling it as a Markov Decision Process (MDP) and training an intelligent agent using RL. The agent calculates adjustment values based on the current and target beam states, executes actions, and iterates until optimal parameters are achieved. A policy network with action attention is designed to improve decision-making by considering both state differences and the impact of optical components. Experiments on two simulated beamlines demonstrate that our algorithm outperforms existing methods, with ablation studies highlighting the effectiveness of the action attention-based policy network. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12183v1-abstract-full').style.display = 'none'; document.getElementById('2411.12183v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12157">arXiv:2411.12157</a> <span> [<a href="https://arxiv.org/pdf/2411.12157">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Combined Encoder and Transformer Approach for Coherent and High-Quality Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiajing Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Z">Zhen Qi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chihang Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hongye Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12157v1-abstract-short" style="display: inline;"> This research introduces a novel text generation model that combines BERT's semantic interpretation strengths with GPT-4's generative capabilities, establishing a high standard in generating coherent, contextually accurate language. Through the combined architecture, the model enhances semantic depth and maintains smooth, human-like text flow, overcoming limitations seen in prior models. Experimen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12157v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12157v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12157v1-abstract-full" style="display: none;"> This research introduces a novel text generation model that combines BERT's semantic interpretation strengths with GPT-4's generative capabilities, establishing a high standard in generating coherent, contextually accurate language. Through the combined architecture, the model enhances semantic depth and maintains smooth, human-like text flow, overcoming limitations seen in prior models. Experimental benchmarks reveal that BERT-GPT-4 surpasses traditional models, including GPT-3, T5, BART, Transformer-XL, and CTRL, in key metrics like Perplexity and BLEU, showcasing its superior natural language generation performance. By fully utilizing contextual information, this hybrid model generates text that is not only logically coherent but also aligns closely with human language patterns, providing an advanced solution for text generation tasks. This research highlights the potential of integrating semantic understanding with advanced generative models, contributing new insights for NLP, and setting a foundation for broader applications of large-scale generative architectures in areas such as automated writing, question-answer systems, and adaptive conversational agents. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12157v1-abstract-full').style.display = 'none'; document.getElementById('2411.12157v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12000">arXiv:2411.12000</a> <span> [<a href="https://arxiv.org/pdf/2411.12000">pdf</a>, <a href="https://arxiv.org/format/2411.12000">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ByteScience: Bridging Unstructured Scientific Literature and Structured Data with Auto Fine-tuned Large Language Model in Token Granularity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+T">Tong Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shaozhou Wang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Y">Yuwei Wan</a>, <a href="/search/cs?searchtype=author&query=Razzak%2C+I">Imran Razzak</a>, <a href="/search/cs?searchtype=author&query=Kit%2C+C">Chunyu Kit</a>, <a href="/search/cs?searchtype=author&query=Hoex%2C+W+Z+B">Wenjie Zhangand Bram Hoex</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12000v1-abstract-short" style="display: inline;"> Natural Language Processing (NLP) is widely used to supply summarization ability from long context to structured information. However, extracting structured knowledge from scientific text by NLP models remains a challenge because of its domain-specific nature to complex data preprocessing and the granularity of multi-layered device-level information. To address this, we introduce ByteScience, a no… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12000v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12000v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12000v1-abstract-full" style="display: none;"> Natural Language Processing (NLP) is widely used to supply summarization ability from long context to structured information. However, extracting structured knowledge from scientific text by NLP models remains a challenge because of its domain-specific nature to complex data preprocessing and the granularity of multi-layered device-level information. To address this, we introduce ByteScience, a non-profit cloud-based auto fine-tuned Large Language Model (LLM) platform, which is designed to extract structured scientific data and synthesize new scientific knowledge from vast scientific corpora. The platform capitalizes on DARWIN, an open-source, fine-tuned LLM dedicated to natural science. 
The platform was built on Amazon Web Services (AWS) and provides an automated, user-friendly workflow for custom model development and data extraction. The platform achieves remarkable accuracy with only a small number of well-annotated articles. This innovative tool streamlines the transition from the scientific literature to structured knowledge and data and benefits the advancements in natural informatics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12000v1-abstract-full').style.display = 'none'; document.getElementById('2411.12000v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11929">arXiv:2411.11929</a> <span> [<a href="https://arxiv.org/pdf/2411.11929">pdf</a>, <a href="https://arxiv.org/format/2411.11929">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> ChatHTTPFuzz: Large Language Model-Assisted IoT HTTP Fuzzing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhe Yang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yanling Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xingwei Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Haohua Du</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuhai Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11929v1-abstract-short" style="display: inline;"> Internet of Things (IoT) devices offer convenience through web interfaces, web VPNs, and other web-based services, all relying on the HTTP protocol. However, these externally exposed HTTP services present significant security risks. Although fuzzing has shown some effectiveness in identifying vulnerabilities in IoT HTTP services, most state-of-the-art tools still rely on random mutation strategies,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11929v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11929v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11929v1-abstract-full" style="display: none;"> Internet of Things (IoT) devices offer convenience through web interfaces, web VPNs, and other web-based services, all relying on the HTTP protocol. However, these externally exposed HTTP services present significant security risks. Although fuzzing has shown some effectiveness in identifying vulnerabilities in IoT HTTP services, most state-of-the-art tools still rely on random mutation strategies, leading to difficulties in accurately understanding the HTTP protocol's structure and to the generation of many invalid test cases. Furthermore, these fuzzers rely on a limited set of initial seeds for testing. 
While this approach initiates testing, the limited number and diversity of seeds hinder comprehensive coverage of complex scenarios in IoT HTTP services. In this paper, we investigate and find that large language models (LLMs) excel in parsing HTTP protocol data and analyzing code logic. Based on these findings, we propose a novel LLM-guided IoT HTTP fuzzing method, ChatHTTPFuzz, which automatically parses protocol fields and analyzes service code logic to generate protocol-compliant test cases. Specifically, we use LLMs to label fields in HTTP protocol data, creating seed templates. Second, The LLM analyzes service code to guide the generation of additional packets aligned with the code logic, enriching the seed templates and their field values. Finally, we design an enhanced Thompson sampling algorithm based on the exploration balance factor and mutation potential factor to schedule seed templates. We evaluate ChatHTTPFuzz on 14 different real-world IoT devices. It finds more vulnerabilities than SNIPUZZ, BOOFUZZ, and MUTINY. ChatHTTPFuzz has discovered 103 vulnerabilities, of which 68 are unique, and 23 have been assigned CVEs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11929v1-abstract-full').style.display = 'none'; document.getElementById('2411.11929v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11845">arXiv:2411.11845</a> <span> [<a href="https://arxiv.org/pdf/2411.11845">pdf</a>, <a href="https://arxiv.org/format/2411.11845">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> UniHands: Unifying Various Wild-Collected Keypoints for Personalized Hand Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Menghe Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Joonyeoup Kim</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yangwen Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuangquan Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+K">Kee-Bong Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11845v1-abstract-short" style="display: inline;"> Accurate hand motion capture and standardized 3D representation are essential for various hand-related tasks. Collecting keypoints-only data, while efficient and cost-effective, results in low-fidelity representations and lacks surface information. Furthermore, data inconsistencies across sources challenge their integration and use. 
We present UniHands, a novel method for creating standardized yet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11845v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11845v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11845v1-abstract-full" style="display: none;"> Accurate hand motion capture and standardized 3D representation are essential for various hand-related tasks. Collecting keypoints-only data, while efficient and cost-effective, results in low-fidelity representations and lacks surface information. Furthermore, data inconsistencies across sources challenge their integration and use. We present UniHands, a novel method for creating standardized yet personalized hand models from wild-collected keypoints from diverse sources. Unlike existing neural implicit representation methods, UniHands uses the widely-adopted parametric models MANO and NIMBLE, providing a more scalable and versatile solution. It also derives unified hand joints from the meshes, which facilitates seamless integration into various hand-related tasks. Experiments on the FreiHAND and InterHand2.6M datasets demonstrate its ability to precisely reconstruct hand mesh vertices and keypoints, effectively capturing high-degree articulation motions. Empirical studies involving nine participants show a clear preference for our unified joints over existing configurations for accuracy and naturalism (p-value 0.016). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11845v1-abstract-full').style.display = 'none'; document.getElementById('2411.11845v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11739">arXiv:2411.11739</a> <span> [<a href="https://arxiv.org/pdf/2411.11739">pdf</a>, <a href="https://arxiv.org/format/2411.11739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> QARM: Quantitative Alignment Multi-Modal Recommendation at Kuaishou </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xinchen Luo</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiangxia Cao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+T">Tianyu Sun</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jinkai Yu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Rui Huang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Wei Yuan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hezheng Lin</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yichen Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qigen Hu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+C">Changqing Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhiheng Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Simin Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+M">Mingxing Wen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhaojie Liu</a>, <a href="/search/cs?searchtype=author&query=Gai%2C+K">Kun Gai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+G">Guorui Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11739v1-abstract-short" style="display: inline;"> In recent years, with the significant evolution of multi-modal large models, many recommender researchers realized the potential of multi-modal information for user interest modeling. In industry, a wide-used modeling architecture is a cascading paradigm: (1) first pre-training a multi-modal model to provide omnipotent representations for downstream services; (2) The downstream recommendation mode… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11739v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11739v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11739v1-abstract-full" style="display: none;"> In recent years, with the significant evolution of multi-modal large models, many recommender researchers realized the potential of multi-modal information for user interest modeling. 
In industry, a widely-used modeling architecture is a cascading paradigm: (1) first pre-training a multi-modal model to provide omnipotent representations for downstream services; (2) the downstream recommendation model takes the multi-modal representation as additional input to fit real user-item behaviours. Although this paradigm achieves remarkable improvements, two problems still limit model performance: (1) Representation Unmatching: The pre-trained multi-modal model is always supervised by the classic NLP/CV tasks, while the recommendation models are supervised by real user-item interactions. As a result, the goals of the two fundamentally different tasks remain relatively separate, and their representations lack a consistent objective; (2) Representation Unlearning: The generated multi-modal representations are always stored in a cache and serve as fixed extra input to the recommendation model, and thus cannot be updated by the recommendation model's gradients, which is unfavourable for downstream training. Motivated by these two challenges in downstream usage, we introduce a quantitative multi-modal framework to customize the specialized and trainable multi-modal information for different downstream models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11739v1-abstract-full').style.display = 'none'; document.getElementById('2411.11739v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> N/A </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11389">arXiv:2411.11389</a> <span> [<a href="https://arxiv.org/pdf/2411.11389">pdf</a>, <a href="https://arxiv.org/format/2411.11389">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Adapting to Cyber Threats: A Phishing Evolution Network (PEN) Framework for Phishing Generation and Analyzing Evolution Patterns using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+F">Fengchao Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tingmin Wu</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+V">Van Nguyen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hongsheng Hu</a>, <a href="/search/cs?searchtype=author&query=Abuadbba%2C+A">Alsharif Abuadbba</a>, <a href="/search/cs?searchtype=author&query=Rudolph%2C+C">Carsten Rudolph</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11389v1-abstract-short" style="display: inline;"> Phishing remains a pervasive cyber threat, as attackers craft deceptive emails 
to lure victims into revealing sensitive information. While Artificial Intelligence (AI), particularly deep learning, has become a key component in defending against phishing attacks, these approaches face critical limitations. The scarcity of publicly available, diverse, and updated data, largely due to privacy concern… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11389v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11389v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11389v1-abstract-full" style="display: none;"> Phishing remains a pervasive cyber threat, as attackers craft deceptive emails to lure victims into revealing sensitive information. While Artificial Intelligence (AI), particularly deep learning, has become a key component in defending against phishing attacks, these approaches face critical limitations. The scarcity of publicly available, diverse, and updated data, largely due to privacy concerns, constrains their effectiveness. As phishing tactics evolve rapidly, models trained on limited, outdated data struggle to detect new, sophisticated deception strategies, leaving systems vulnerable to an ever-growing array of attacks. Addressing this gap is essential to strengthening defenses in an increasingly hostile cyber landscape. To address this gap, we propose the Phishing Evolution Network (PEN), a framework leveraging large language models (LLMs) and adversarial training mechanisms to continuously generate high quality and realistic diverse phishing samples, and analyze features of LLM-provided phishing to understand evolving phishing patterns. We evaluate the quality and diversity of phishing samples generated by PEN and find that it produces over 80% realistic phishing samples, effectively expanding phishing datasets across seven dominant types. These PEN-generated samples enhance the performance of current phishing detectors, leading to a 40% improvement in detection accuracy. Additionally, the use of PEN significantly boosts model robustness, reducing detectors' sensitivity to perturbations by up to 60%, thereby decreasing attack success rates under adversarial conditions. When we analyze the phishing patterns that are used in LLM-generated phishing, the cognitive complexity and the tone of time limitation are detected with statistically significant differences compared with existing phishing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11389v1-abstract-full').style.display = 'none'; document.getElementById('2411.11389v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11284">arXiv:2411.11284</a> <span> [<a href="https://arxiv.org/pdf/2411.11284">pdf</a>, <a href="https://arxiv.org/format/2411.11284">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dual-Frequency Filtering Self-aware Graph Neural Networks for Homophilic and Heterophilic Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yachao Yang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanfeng Sun</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jipeng Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junbin Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shaofan Wang</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+F">Fujiao Ju</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Baocai Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11284v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have excelled in handling graph-structured data, attracting significant research interest. However, two primary challenges have emerged: interference between topology and attributes distorting node representations, and the low-pass filtering nature of most GNNs leading to the oversight of valuable high-frequency information in graph signals. These issues are particular… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11284v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11284v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11284v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have excelled in handling graph-structured data, attracting significant research interest. However, two primary challenges have emerged: interference between topology and attributes distorting node representations, and the low-pass filtering nature of most GNNs leading to the oversight of valuable high-frequency information in graph signals. These issues are particularly pronounced in heterophilic graphs. To address these challenges, we propose Dual-Frequency Filtering Self-aware Graph Neural Networks (DFGNN). DFGNN integrates low-pass and high-pass filters to extract smooth and detailed topological features, using frequency-specific constraints to minimize noise and redundancy in the respective frequency bands. The model dynamically adjusts filtering ratios to accommodate both homophilic and heterophilic graphs. Furthermore, DFGNN mitigates interference by aligning topological and attribute representations through dynamic correspondences between their respective frequency bands, enhancing overall model performance and expressiveness. Extensive experiments conducted on benchmark datasets demonstrate that DFGNN outperforms state-of-the-art methods in classification performance, highlighting its effectiveness in handling both homophilic and heterophilic graphs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11284v1-abstract-full').style.display = 'none'; document.getElementById('2411.11284v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11pages,17figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11197">arXiv:2411.11197</a> <span> [<a href="https://arxiv.org/pdf/2411.11197">pdf</a>, <a href="https://arxiv.org/format/2411.11197">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Stealing Training Graphs from Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+M">Minhua Lin</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+E">Enyan Dai</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Junjie Xu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+J">Jinyuan Jia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Suhang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11197v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have shown promising results in modeling graphs in various tasks. The training of GNNs, especially on specialized tasks such as bioinformatics, demands extensive expert annotations, which are expensive and usually contain sensitive information of data providers. The trained GNN models are often shared for deployment in the real world. As neural networks can memorize th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11197v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11197v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11197v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have shown promising results in modeling graphs in various tasks. The training of GNNs, especially on specialized tasks such as bioinformatics, demands extensive expert annotations, which are expensive and usually contain sensitive information of data providers. The trained GNN models are often shared for deployment in the real world. As neural networks can memorize the training samples, the model parameters of GNNs have a high risk of leaking private training data. Our theoretical analysis shows the strong connections between trained GNN parameters and the training graphs used, confirming the training graph leakage issue. However, explorations into training data leakage from trained GNNs are rather limited. 
Therefore, we investigate a novel problem of stealing graphs from trained GNNs. To obtain high-quality graphs that resemble the target training set, a graph diffusion model with diffusion noise optimization is deployed as a graph generator. Furthermore, we propose a selection method that effectively leverages GNN model parameters to identify training graphs from samples generated by the graph diffusion model. Extensive experiments on real-world datasets demonstrate the effectiveness of the proposed framework in stealing training graphs from the trained GNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11197v1-abstract-full').style.display = 'none'; document.getElementById('2411.11197v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in KDD 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10940">arXiv:2411.10940</a> <span> [<a href="https://arxiv.org/pdf/2411.10940">pdf</a>, <a href="https://arxiv.org/format/2411.10940">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Monocular SLAM-based Multi-User Positioning System with Image Occlusion in Augmented Reality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lien%2C+W">Wei-Hsiang Lien</a>, <a href="/search/cs?searchtype=author&query=Chandra%2C+B+K">Benedictus Kent Chandra</a>, <a href="/search/cs?searchtype=author&query=Fischer%2C+R">Robin Fischer</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Ya-Hui Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiann-Jang Wang</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+W">Wei-En Hsu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+L">Li-Chen Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10940v1-abstract-short" style="display: inline;"> In recent years, with the rapid development of augmented reality (AR) technology, there is an increasing demand for multi-user collaborative experiences. Unlike for single-user experiences, ensuring the spatial localization of every user and maintaining synchronization and consistency of positioning and orientation across multiple users is a significant challenge. 
In this paper, we propose a multi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10940v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10940v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10940v1-abstract-full" style="display: none;"> In recent years, with the rapid development of augmented reality (AR) technology, there is an increasing demand for multi-user collaborative experiences. Unlike for single-user experiences, ensuring the spatial localization of every user and maintaining synchronization and consistency of positioning and orientation across multiple users is a significant challenge. In this paper, we propose a multi-user localization system based on ORB-SLAM2 using monocular RGB images as a development platform based on the Unity 3D game engine. This system not only performs user localization but also places a common virtual object on a planar surface (such as table) in the environment so that every user holds a proper perspective view of the object. These generated virtual objects serve as reference points for multi-user position synchronization. The positioning information is passed among every user's AR devices via a central server, based on which the relative position and movement of other users in the space of a specific user are presented via virtual avatars all with respect to these virtual objects. In addition, we use deep learning techniques to estimate the depth map of an image from a single RGB image to solve occlusion problems in AR applications, making virtual objects appear more natural in AR scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10940v1-abstract-full').style.display = 'none'; document.getElementById('2411.10940v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10914">arXiv:2411.10914</a> <span> [<a href="https://arxiv.org/pdf/2411.10914">pdf</a>, <a href="https://arxiv.org/format/2411.10914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BPO: Towards Balanced Preference Optimization between Knowledge Breadth and Depth in Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sizhe Wang</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+Y">Yongqi Tong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hengyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dawei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10914v1-abstract-short" style="display: inline;"> Reinforcement Learning with Human Feedback (RLHF) is the key to the success of large language models (LLMs) in recent years. In this work, we first introduce the concepts of knowledge breadth and knowledge depth, which measure the comprehensiveness and depth of an LLM or knowledge source respectively. We reveal that the imbalance in the number of prompts and responses can lead to a potential dispa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10914v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10914v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10914v1-abstract-full" style="display: none;"> Reinforcement Learning with Human Feedback (RLHF) is the key to the success of large language models (LLMs) in recent years. In this work, we first introduce the concepts of knowledge breadth and knowledge depth, which measure the comprehensiveness and depth of an LLM or knowledge source respectively. We reveal that the imbalance in the number of prompts and responses can lead to a potential disparity in breadth and depth learning within alignment tuning datasets by showing that even a simple uniform method for balancing the number of instructions and responses can lead to significant improvements. Building on this, we further propose Balanced Preference Optimization (BPO), designed to dynamically augment the knowledge depth of each sample. BPO is motivated by the observation that the usefulness of knowledge varies across samples, necessitating tailored learning of knowledge depth. To achieve this, we introduce gradient-based clustering, estimating the knowledge informativeness and usefulness of each augmented sample based on the model's optimization direction. Our experimental results across various benchmarks demonstrate that BPO outperforms other baseline methods in alignment tuning while maintaining training efficiency. Furthermore, we conduct a detailed analysis of each component of BPO, providing guidelines for future research in preference data optimization. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10914v1-abstract-full').style.display = 'none'; document.getElementById('2411.10914v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10775">arXiv:2411.10775</a> <span> [<a href="https://arxiv.org/pdf/2411.10775">pdf</a>, <a href="https://arxiv.org/format/2411.10775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Beyond Feature Mapping GAP: Integrating Real HDRTV Priors for Superior SDRTV-to-HDRTV Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kepeng Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Li Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+G">Gang He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenxin Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shihao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Dajiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunsong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10775v1-abstract-short" style="display: inline;"> The rise of HDR-WCG display devices has highlighted the need to convert SDRTV to HDRTV, as most video sources are still in SDR. Existing methods primarily focus on designing neural networks to learn a single-style mapping from SDRTV to HDRTV. However, the limited information in SDRTV and the diversity of styles in real-world conversions render this process an ill-posed problem, thereby constrainin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10775v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10775v1-abstract-full" style="display: none;"> The rise of HDR-WCG display devices has highlighted the need to convert SDRTV to HDRTV, as most video sources are still in SDR. Existing methods primarily focus on designing neural networks to learn a single-style mapping from SDRTV to HDRTV. However, the limited information in SDRTV and the diversity of styles in real-world conversions render this process an ill-posed problem, thereby constraining the performance and generalization of these methods. Inspired by generative approaches, we propose a novel method for SDRTV to HDRTV conversion guided by real HDRTV priors. 
Despite the limited information in SDRTV, introducing real HDRTV as reference priors significantly constrains the solution space of the originally high-dimensional ill-posed problem. This shift transforms the task from solving an unreferenced prediction problem to making a referenced selection, thereby markedly enhancing the accuracy and reliability of the conversion process. Specifically, our approach comprises two stages: the first stage employs a Vector Quantized Generative Adversarial Network to capture HDRTV priors, while the second stage matches these priors to the input SDRTV content to recover realistic HDRTV outputs. We evaluate our method on public datasets, demonstrating its effectiveness with significant improvements in both objective and subjective metrics across real and synthetic datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10775v1-abstract-full').style.display = 'none'; document.getElementById('2411.10775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages,4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10496">arXiv:2411.10496</a> <span> [<a href="https://arxiv.org/pdf/2411.10496">pdf</a>, <a href="https://arxiv.org/format/2411.10496">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Finance">q-fin.CP</span> </div> </div> <p class="title is-5 mathjax"> Guided Learning: Lubricating End-to-End Modeling for Multi-stage Decision-making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jian Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Saizhuo Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yiyan Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10496v1-abstract-short" style="display: inline;"> Multi-stage decision-making is crucial in various real-world artificial intelligence applications, including recommendation systems, autonomous driving, and quantitative investment systems. In quantitative investment, for example, the process typically involves several sequential stages such as factor mining, alpha prediction, portfolio optimization, and sometimes order execution. 
While state-of-t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10496v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10496v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10496v1-abstract-full" style="display: none;"> Multi-stage decision-making is crucial in various real-world artificial intelligence applications, including recommendation systems, autonomous driving, and quantitative investment systems. In quantitative investment, for example, the process typically involves several sequential stages such as factor mining, alpha prediction, portfolio optimization, and sometimes order execution. While state-of-the-art end-to-end modeling aims to unify these stages into a single global framework, it faces significant challenges: (1) training such a unified neural network consisting of multiple stages between initial inputs and final outputs often leads to suboptimal solutions, or even collapse, and (2) many decision-making scenarios are not easily reducible to standard prediction problems. To overcome these challenges, we propose Guided Learning, a novel methodological framework designed to enhance end-to-end learning in multi-stage decision-making. We introduce the concept of a ``guide'', a function that induces the training of intermediate neural network layers towards some phased goals, directing gradients away from suboptimal collapse. For decision scenarios lacking explicit supervisory labels, we incorporate a utility function that quantifies the ``reward'' of the overall decision. Additionally, we explore the connections between Guided Learning and classic machine learning paradigms such as supervised, unsupervised, semi-supervised, multi-task, and reinforcement learning. Experiments on quantitative investment strategy building demonstrate that guided learning significantly outperforms both traditional stage-wise approaches and existing end-to-end methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10496v1-abstract-full').style.display = 'none'; document.getElementById('2411.10496v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
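<p>A minimal PyTorch sketch of the ``guide'' idea above: an auxiliary loss attached to an intermediate stage pulls its representation toward a phased goal (here a toy alpha-prediction target) while the whole two-stage pipeline is still trained end to end. The architecture, targets, and weighting are illustrative assumptions, not the paper's model.</p>
<pre><code>
import torch
import torch.nn as nn

torch.manual_seed(0)

stage1 = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 4))   # e.g. alpha prediction
stage2 = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 1))    # e.g. portfolio decision

x = torch.randn(64, 16)            # raw features
alpha_target = torch.randn(64, 4)  # phased goal for the intermediate stage (the "guide")
final_target = torch.randn(64, 1)  # end-to-end objective, e.g. realised utility

opt = torch.optim.Adam(list(stage1.parameters()) + list(stage2.parameters()), lr=1e-3)
lam = 0.5                          # weight of the guide loss

for step in range(200):
    mid = stage1(x)
    out = stage2(mid)
    loss_final = nn.functional.mse_loss(out, final_target)
    loss_guide = nn.functional.mse_loss(mid, alpha_target)   # keeps stage1 from collapsing
    loss = loss_final + lam * loss_guide
    opt.zero_grad()
    loss.backward()
    opt.step()
</code></pre>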
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10351">arXiv:2411.10351</a> <span> [<a href="https://arxiv.org/pdf/2411.10351">pdf</a>, <a href="https://arxiv.org/format/2411.10351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Bias Unveiled: Investigating Social Bias in LLM-Generated Code </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+L">Lin Ling</a>, <a href="/search/cs?searchtype=author&query=Rabbi%2C+F">Fazle Rabbi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinqiu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10351v1-abstract-short" style="display: inline;"> Large language models (LLMs) have significantly advanced the field of automated code generation. However, a notable research gap exists in the evaluation of social biases that may be present in the code produced by LLMs. To solve this issue, we propose a novel fairness framework, i.e., Solar, to assess and mitigate the social biases of LLM-generated code. Specifically, Solar can automatically gene… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10351v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10351v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10351v1-abstract-full" style="display: none;"> Large language models (LLMs) have significantly advanced the field of automated code generation. However, a notable research gap exists in the evaluation of social biases that may be present in the code produced by LLMs. To solve this issue, we propose a novel fairness framework, i.e., Solar, to assess and mitigate the social biases of LLM-generated code. Specifically, Solar can automatically generate test cases for quantitatively uncovering social biases of the auto-generated code by LLMs. To quantify the severity of social biases in generated code, we develop a dataset that covers a diverse set of social problems. We applied Solar and the crafted dataset to four state-of-the-art LLMs for code generation. Our evaluation reveals severe bias in the LLM-generated code from all the subject LLMs. Furthermore, we explore several strategies for bias mitigation, including Chain-of-Thought (CoT) prompting, combining positive role-playing with CoT prompting and iterative prompting. Our experiments show that iterative prompting can effectively reduce social bias in LLM-generated code by up to 90%. Solar is highly extensible to evaluate new social problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10351v1-abstract-full').style.display = 'none'; document.getElementById('2411.10351v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
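<p>A rough sketch of the iterative-prompting mitigation mentioned above: generate code, run a bias check, and re-prompt with feedback until the check passes. The toy generator and the keyword-based check are placeholders for an actual LLM client and for Solar's test-case-based evaluation.</p>
<pre><code>
def make_toy_generator():
    """Stand-in for an LLM client: the first answer branches on gender, the revision does not."""
    answers = iter([
        "def approve_loan(income, gender):\n    return income > 50000 and gender == 'male'",
        "def approve_loan(income):\n    return income > 50000",
    ])
    return lambda prompt: next(answers)

def biased(code, sensitive_terms=("gender", "race", "religion")):
    # Crude stand-in for Solar's test-case-based bias check.
    return any(term in code.lower() for term in sensitive_terms)

def iterative_prompting(task, generate, max_rounds=3):
    code = generate(task)
    for _ in range(max_rounds):
        if not biased(code):
            break
        feedback = (task + "\n\nThe previous solution conditioned on a sensitive "
                    "attribute. Rewrite it so the decision does not depend on it.\n\n" + code)
        code = generate(feedback)
    return code

generate = make_toy_generator()
print(iterative_prompting("Write approve_loan(applicant) for a bank.", generate))
</code></pre>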
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10161">arXiv:2411.10161</a> <span> [<a href="https://arxiv.org/pdf/2411.10161">pdf</a>, <a href="https://arxiv.org/format/2411.10161">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SEAGULL: No-reference Image Quality Assessment for Regions of Interest via Vision-Language Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zewen Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Juan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Sunhan Xu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hang Xiong</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yun Zeng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jian Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuxun Wang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+C">Chunfeng Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bing Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Weiming Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10161v1-abstract-short" style="display: inline;"> Existing Image Quality Assessment (IQA) methods achieve remarkable success in analyzing quality for overall image, but few works explore quality analysis for Regions of Interest (ROIs). The quality analysis of ROIs can provide fine-grained guidance for image quality improvement and is crucial for scenarios focusing on region-level quality. This paper proposes a novel network, SEAGULL, which can SE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10161v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10161v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10161v1-abstract-full" style="display: none;"> Existing Image Quality Assessment (IQA) methods achieve remarkable success in analyzing quality for overall image, but few works explore quality analysis for Regions of Interest (ROIs). The quality analysis of ROIs can provide fine-grained guidance for image quality improvement and is crucial for scenarios focusing on region-level quality. This paper proposes a novel network, SEAGULL, which can SEe and Assess ROIs quality with GUidance from a Large vision-Language model. SEAGULL incorporates a vision-language model (VLM), masks generated by Segment Anything Model (SAM) to specify ROIs, and a meticulously designed Mask-based Feature Extractor (MFE) to extract global and local tokens for specified ROIs, enabling accurate fine-grained IQA for ROIs. Moreover, this paper constructs two ROI-based IQA datasets, SEAGULL-100w and SEAGULL-3k, for training and evaluating ROI-based IQA. 
SEAGULL-100w comprises about 100w synthetic distortion images with 33 million ROIs for pre-training to improve the model's ability of regional quality perception, and SEAGULL-3k contains about 3k authentic distortion ROIs to enhance the model's ability to perceive real world distortions. After pre-training on SEAGULL-100w and fine-tuning on SEAGULL-3k, SEAGULL shows remarkable performance on fine-grained ROI quality assessment. Code and datasets are publicly available at the https://github.com/chencn2020/Seagull. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10161v1-abstract-full').style.display = 'none'; document.getElementById('2411.10161v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09971">arXiv:2411.09971</a> <span> [<a href="https://arxiv.org/pdf/2411.09971">pdf</a>, <a href="https://arxiv.org/format/2411.09971">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Explanation for Trajectory Planning using Multi-modal Large Language Model for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yamazaki%2C+S">Shota Yamazaki</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Nanri%2C+T">Takuya Nanri</a>, <a href="/search/cs?searchtype=author&query=Shigekane%2C+A">Akio Shigekane</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Nishiyama%2C+J">Jo Nishiyama</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+T">Tao Chu</a>, <a href="/search/cs?searchtype=author&query=Yokosawa%2C+K">Kohei Yokosawa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09971v1-abstract-short" style="display: inline;"> End-to-end style autonomous driving models have been developed recently. These models lack interpretability of decision-making process from perception to control of the ego vehicle, resulting in anxiety for passengers. To alleviate it, it is effective to build a model which outputs captions describing future behaviors of the ego vehicle and their reason. However, the existing approaches generate r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09971v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09971v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09971v1-abstract-full" style="display: none;"> End-to-end style autonomous driving models have been developed recently. These models lack interpretability of decision-making process from perception to control of the ego vehicle, resulting in anxiety for passengers. 
To alleviate it, it is effective to build a model which outputs captions describing future behaviors of the ego vehicle and their reason. However, the existing approaches generate reasoning text that inadequately reflects the future plans of the ego vehicle, because they train models to output captions using momentary control signals as inputs. In this study, we propose a reasoning model that takes future planning trajectories of the ego vehicle as inputs to solve this limitation with the dataset newly collected. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09971v1-abstract-full').style.display = 'none'; document.getElementById('2411.09971v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted and presented at ECCV 2024 2nd Workshop on Vision-Centric Autonomous Driving (VCAD) on September 30, 2024. 13 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09914">arXiv:2411.09914</a> <span> [<a href="https://arxiv.org/pdf/2411.09914">pdf</a>, <a href="https://arxiv.org/format/2411.09914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> mmSpyVR: Exploiting mmWave Radar for Penetrating Obstacles to Uncover Privacy Vulnerability of Virtual Reality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mei%2C+L">Luoyu Mei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhimeng Yin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qingchuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wenchao Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Kangjie Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tian He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09914v1-abstract-short" style="display: inline;"> Virtual reality (VR), while enhancing user experiences, introduces significant privacy risks. This paper reveals a novel vulnerability in VR systems that allows attackers to capture VR privacy through obstacles utilizing millimeter-wave (mmWave) signals without physical intrusion and virtual connection with the VR devices. We propose mmSpyVR, a novel attack on VR user's privacy via mmWave radar. 
T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09914v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09914v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09914v1-abstract-full" style="display: none;"> Virtual reality (VR), while enhancing user experiences, introduces significant privacy risks. This paper reveals a novel vulnerability in VR systems that allows attackers to capture VR privacy through obstacles utilizing millimeter-wave (mmWave) signals without physical intrusion and virtual connection with the VR devices. We propose mmSpyVR, a novel attack on VR user's privacy via mmWave radar. The mmSpyVR framework encompasses two main parts: (i) A transfer learning-based feature extraction model to achieve VR feature extraction from mmWave signal. (ii) An attention-based VR privacy spying module to spy VR privacy information from the extracted feature. The mmSpyVR demonstrates the capability to extract critical VR privacy from the mmWave signals that have penetrated through obstacles. We evaluate mmSpyVR through IRB-approved user studies. Across 22 participants engaged in four experimental scenes utilizing VR devices from three different manufacturers, our system achieves an application recognition accuracy of 98.5\% and keystroke recognition accuracy of 92.6\%. This newly discovered vulnerability has implications across various domains, such as cybersecurity, privacy protection, and VR technology development. We also engage with VR manufacturer Meta to discuss and explore potential mitigation strategies. Data and code are publicly available for scrutiny and research at https://github.com/luoyumei1-a/mmSpyVR/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09914v1-abstract-full').style.display = 'none'; document.getElementById('2411.09914v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09693">arXiv:2411.09693</a> <span> [<a href="https://arxiv.org/pdf/2411.09693">pdf</a>, <a href="https://arxiv.org/format/2411.09693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CropCraft: Inverse Procedural Modeling for 3D Reconstruction of Crop Plants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhai%2C+A+J">Albert J. 
Zhai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinlei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kaiyuan Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhao Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junxiong Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sheng Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhenong Jin</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+K">Kaiyu Guan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shenlong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09693v1-abstract-short" style="display: inline;"> The ability to automatically build 3D digital twins of plants from images has countless applications in agriculture, environmental science, robotics, and other fields. However, current 3D reconstruction methods fail to recover complete shapes of plants due to heavy occlusion and complex geometries. In this work, we present a novel method for 3D reconstruction of agricultural crops based on optimiz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09693v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09693v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09693v1-abstract-full" style="display: none;"> The ability to automatically build 3D digital twins of plants from images has countless applications in agriculture, environmental science, robotics, and other fields. However, current 3D reconstruction methods fail to recover complete shapes of plants due to heavy occlusion and complex geometries. In this work, we present a novel method for 3D reconstruction of agricultural crops based on optimizing a parametric model of plant morphology via inverse procedural modeling. Our method first estimates depth maps by fitting a neural radiance field and then employs Bayesian optimization to estimate plant morphological parameters that result in consistent depth renderings. The resulting 3D model is complete and biologically plausible. We validate our method on a dataset of real images of agricultural fields, and demonstrate that the reconstructions can be used for a variety of monitoring and simulation applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09693v1-abstract-full').style.display = 'none'; document.getElementById('2411.09693v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
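<p>A toy Python sketch of the inverse procedural modeling loop described above: search over morphology parameters so that the depth map rendered from a parametric plant model matches the depth estimated from images. The two-parameter ``renderer'' and the random search (standing in for the Bayesian optimization used in the paper) are illustrative assumptions.</p>
<pre><code>
import numpy as np

rng = np.random.default_rng(0)

def render_depth(params, grid=32):
    # Toy stand-in for rendering a depth map from morphology parameters
    # (canopy height, leaf spread); the real pipeline renders a full 3D plant model.
    height, spread = params
    xx, yy = np.meshgrid(np.linspace(-1, 1, grid), np.linspace(-1, 1, grid))
    return height * np.exp(-(xx ** 2 + yy ** 2) / (2 * spread ** 2))

# "Observed" depth, e.g. estimated from images via a neural radiance field.
observed = render_depth((0.8, 0.35)) + 0.01 * rng.normal(size=(32, 32))

def depth_mismatch(params):
    return float(np.mean((render_depth(params) - observed) ** 2))

# Random search over parameters as a stand-in for Bayesian optimisation.
candidates = [(rng.uniform(0.1, 2.0), rng.uniform(0.1, 1.0)) for _ in range(500)]
best = min(candidates, key=depth_mismatch)
print(best, depth_mismatch(best))
</code></pre>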
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09217">arXiv:2411.09217</a> <span> [<a href="https://arxiv.org/pdf/2411.09217">pdf</a>, <a href="https://arxiv.org/format/2411.09217">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> SmartInv: Multimodal Learning for Smart Contract Invariant Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S+J">Sally Junsong Wang</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+K">Kexin Pei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junfeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09217v1-abstract-short" style="display: inline;"> Smart contracts are software programs that enable diverse business activities on the blockchain. Recent research has identified new classes of "machine un-auditable" bugs that arise from both transactional contexts and source code. Existing detection methods require human understanding of underlying transaction logic and manual reasoning across different sources of context (i.e. modalities), such… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09217v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09217v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09217v1-abstract-full" style="display: none;"> Smart contracts are software programs that enable diverse business activities on the blockchain. Recent research has identified new classes of "machine un-auditable" bugs that arise from both transactional contexts and source code. Existing detection methods require human understanding of underlying transaction logic and manual reasoning across different sources of context (i.e. modalities), such as code, dynamic transaction executions, and natural language specifying the expected transaction behavior. To automate the detection of ``machine un-auditable'' bugs, we present SmartInv, an accurate and fast smart contract invariant inference framework. Our key insight is that the expected behavior of smart contracts, as specified by invariants, relies on understanding and reasoning across multimodal information, such as source code and natural language. We propose a new prompting strategy to foundation models, Tier of Thought (ToT), to reason across multiple modalities of smart contracts and ultimately to generate invariants. By checking the violation of these generated invariants, SmartInv can identify potential vulnerabilities. We evaluate SmartInv on real-world contracts and re-discover bugs that resulted in multi-million dollar losses over the past 2.5 years (from January 1, 2021 to May 31, 2023). 
Our extensive evaluation shows that SmartInv generates (3.5$\times$) more bug-critical invariants and detects (4$\times$) more critical bugs compared to the state-of-the-art tools in significantly (150$\times$) less time. SmartInv uncovers 119 zero-day vulnerabilities from the 89,621 real-world contracts. Among them, five are critical zero-day bugs confirmed by developers as ``high severity.'' <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09217v1-abstract-full').style.display = 'none'; document.getElementById('2411.09217v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09065">arXiv:2411.09065</a> <span> [<a href="https://arxiv.org/pdf/2411.09065">pdf</a>, <a href="https://arxiv.org/format/2411.09065">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Language-Model Prior Overcomes Cold-Start Items </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Hao Ding</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yupeng Gu</a>, <a href="/search/cs?searchtype=author&query=Aydore%2C+S">Sergul Aydore</a>, <a href="/search/cs?searchtype=author&query=Kalantari%2C+K">Kousha Kalantari</a>, <a href="/search/cs?searchtype=author&query=Kveton%2C+B">Branislav Kveton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09065v1-abstract-short" style="display: inline;"> The growth of recommender systems (RecSys) is driven by digitization and the need for personalized content in areas such as e-commerce and video streaming. The content in these systems often changes rapidly and therefore they constantly face the ongoing cold-start problem, where new items lack interaction data and are hard to value.
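<p>To make the invariant-checking step of SmartInv (above) concrete, a small Python sketch: candidate invariants are evaluated over recorded per-transaction contract states, and any violation flags a potential vulnerability. The states and invariants here are hand-written toys, not Tier-of-Thought output.</p>
<pre><code>
# Toy transaction traces: each entry is the contract state after one transaction.
traces = [
    {"total_supply": 1000, "sum_balances": 1000, "owner_changed": False},
    {"total_supply": 1000, "sum_balances": 1000, "owner_changed": False},
    {"total_supply": 1000, "sum_balances": 1200, "owner_changed": True},   # buggy tx
]

# Candidate invariants (hand-written here; in SmartInv they would be generated
# by prompting a foundation model over code, comments, and transaction context).
invariants = {
    "supply_conserved": lambda s: s["total_supply"] == s["sum_balances"],
    "owner_immutable":  lambda s: not s["owner_changed"],
}

for name, inv in invariants.items():
    violations = [i for i, state in enumerate(traces) if not inv(state)]
    status = "holds" if not violations else f"violated at tx {violations}"
    print(f"{name}: {status}")
</code></pre>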
Existing solutions for the cold-start problem, such as content-based recommenders and hybrid methods, leverage item metadata to determine item similarities. The main challenge with these methods is their reliance on structured and informative metadata to capture detailed item similarities, which may not always be available. This paper introduces a novel approach for cold-start item recommendation that utilizes the language model (LM) to estimate item similarities, which are further integrated as a Bayesian prior with classic recommender systems. This approach is generic and able to boost the performance of various recommenders. Specifically, our experiments integrate it with both sequential and collaborative filtering-based recommender and evaluate it on two real-world datasets, demonstrating the enhanced performance of the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09065v1-abstract-full').style.display = 'none'; document.getElementById('2411.09065v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is dedicated to cold-start item recommendation using language-model priors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08887">arXiv:2411.08887</a> <span> [<a href="https://arxiv.org/pdf/2411.08887">pdf</a>, <a href="https://arxiv.org/format/2411.08887">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning-Based CKM Construction with Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaoli Xu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yong Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08887v1-abstract-short" style="display: inline;"> Channel knowledge map (CKM) is a novel technique for achieving environment awareness, and thereby improving the communication and sensing performance for wireless systems. A fundamental problem associated with CKM is how to construct a complete CKM that provides channel knowledge for a large number of locations based solely on sparse data measurements. 
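<p>A compact sketch of how a language-model prior can be folded into a cold-start estimate, in the spirit of the paper above: an item's empirical feedback is shrunk toward a prior built from LM-embedding similarity to better-observed items. The embeddings, counts, and shrinkage rule are illustrative assumptions rather than the paper's exact Bayesian formulation.</p>
<pre><code>
import numpy as np

rng = np.random.default_rng(0)

# Item descriptions embedded by a language model (random stand-ins here).
emb = rng.normal(size=(6, 16))
emb /= np.linalg.norm(emb, axis=1, keepdims=True)

# Observed mean feedback and interaction counts; item 5 is cold (no interactions).
mean_reward = np.array([0.9, 0.2, 0.7, 0.4, 0.6, 0.0])
counts      = np.array([500, 300, 200, 150, 100, 0])

def posterior_estimate(item, prior_strength=20.0):
    """Shrink an item's empirical estimate toward an LM-similarity-weighted prior."""
    sims = emb @ emb[item]
    sims[item] = 0.0
    weights = np.clip(sims, 0.0, None)
    prior = np.average(mean_reward, weights=weights * counts + 1e-9)
    n = counts[item]
    return (n * mean_reward[item] + prior_strength * prior) / (n + prior_strength)

print(posterior_estimate(5))   # cold item falls back almost entirely on the LM prior
print(posterior_estimate(0))   # well-observed item barely moves
</code></pre>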
This problem bears similarities to the super-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08887v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08887v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08887v1-abstract-full" style="display: none;"> Channel knowledge map (CKM) is a novel technique for achieving environment awareness, and thereby improving the communication and sensing performance for wireless systems. A fundamental problem associated with CKM is how to construct a complete CKM that provides channel knowledge for a large number of locations based solely on sparse data measurements. This problem bears similarities to the super-resolution (SR) problem in image processing. In this letter, we propose an effective deep learning-based CKM construction method that leverages the image SR network known as SRResNet. Unlike most existing studies, our approach does not require any additional input beyond the sparsely measured data. In addition to the conventional path loss map construction, our approach can also be applied to construct channel angle maps (CAMs), thanks to the use of a new dataset called CKMImageNet. The numerical results demonstrate that our method outperforms interpolation-based methods such as nearest neighbour and bicubic interpolation, as well as the SRGAN method in CKM construction. Furthermore, only 1/16 of the locations need to be measured in order to achieve a root mean square error (RMSE) of 1.1 dB in path loss. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08887v1-abstract-full').style.display = 'none'; document.getElementById('2411.08887v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
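<p>A small Python sketch of the setup above: a dense path-loss map is reconstructed from measurements at 1/16 of the locations. Bicubic interpolation is shown as the kind of baseline the letter compares against; the proposed method would replace that step with an SRResNet-style network. The synthetic map is an illustrative assumption.</p>
<pre><code>
import numpy as np
from scipy.ndimage import zoom

# Ground-truth path-loss map over a 64x64 grid of locations (synthetic stand-in).
xx, yy = np.meshgrid(np.linspace(0, 1, 64), np.linspace(0, 1, 64))
true_map = 80 + 30 * np.sqrt((xx - 0.3) ** 2 + (yy - 0.6) ** 2)   # dB

# Sparse measurements: only every 4th location in each dimension (1/16 of the grid).
sparse = true_map[::4, ::4]

# Interpolation baseline (bicubic); a learned super-resolution model replaces this step.
recon = zoom(sparse, 4, order=3)

rmse = np.sqrt(np.mean((recon - true_map) ** 2))
print(f"bicubic RMSE: {rmse:.2f} dB")
</code></pre>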
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08374">arXiv:2411.08374</a> <span> [<a href="https://arxiv.org/pdf/2411.08374">pdf</a>, <a href="https://arxiv.org/format/2411.08374">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Federated Graph Learning with Graphless Clients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xingbo Fu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yushun Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Binchi Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08374v1-abstract-short" style="display: inline;"> Federated Graph Learning (FGL) is tasked with training machine learning models, such as Graph Neural Networks (GNNs), for multiple clients, each with its own graph data. Existing methods usually assume that each client has both node features and graph structure of its graph data. In real-world scenarios, however, there exist federated systems where only a part of the clients have such data while o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08374v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08374v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08374v1-abstract-full" style="display: none;"> Federated Graph Learning (FGL) is tasked with training machine learning models, such as Graph Neural Networks (GNNs), for multiple clients, each with its own graph data. Existing methods usually assume that each client has both node features and graph structure of its graph data. In real-world scenarios, however, there exist federated systems where only a part of the clients have such data while other clients (i.e. graphless clients) may only have node features. This naturally leads to a novel problem in FGL: how to jointly train a model over distributed graph data with graphless clients? In this paper, we propose a novel framework FedGLS to tackle the problem in FGL with graphless clients. In FedGLS, we devise a local graph learner on each graphless client which learns the local graph structure with the structure knowledge transferred from other clients. To enable structure knowledge transfer, we design a GNN model and a feature encoder on each client. During local training, the feature encoder retains the local graph structure knowledge together with the GNN model via knowledge distillation, and the structure knowledge is transferred among clients in global update. Our extensive experiments demonstrate the superiority of the proposed FedGLS over five baselines. 
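<p>The knowledge-distillation term used during local training in FedGLS can be illustrated with a generic Hinton-style soft-label loss, sketched below in PyTorch. The toy logits stand in for the GNN (teacher) and feature-encoder (student) outputs for a batch of nodes; the exact form used in the paper may differ.</p>
<pre><code>
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, T=2.0):
    """Soft-label knowledge distillation: temperature-scaled KL divergence."""
    p_teacher = F.softmax(teacher_logits / T, dim=-1)
    log_p_student = F.log_softmax(student_logits / T, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (T * T)

# Toy check with random outputs for 32 nodes and 7 classes.
teacher = torch.randn(32, 7)
student = torch.randn(32, 7, requires_grad=True)
loss = distillation_loss(student, teacher)
loss.backward()
print(float(loss))
</code></pre>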
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08374v1-abstract-full').style.display = 'none'; document.getElementById('2411.08374v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Transactions on Machine Learning Research (TMLR)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08294">arXiv:2411.08294</a> <span> [<a href="https://arxiv.org/pdf/2411.08294">pdf</a>, <a href="https://arxiv.org/format/2411.08294">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Collaborative Participatory Research with LLM Agents in South Asia: An Empirically-Grounded Methodological Initiative and Agenda from Field Evidence in Sri Lanka </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinjie Zhao</a>, <a href="/search/cs?searchtype=author&query=Sriwarnasinghe%2C+S+M">Shyaman Maduranga Sriwarnasinghe</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiacheng Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyun Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Morikawa%2C+S">So Morikawa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08294v1-abstract-short" style="display: inline;"> The integration of artificial intelligence into development research methodologies presents unprecedented opportunities for addressing persistent challenges in participatory research, particularly in linguistically diverse regions like South Asia. Drawing from an empirical implementation in Sri Lanka's Sinhala-speaking communities, this paper presents an empirically grounded methodological framewo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08294v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08294v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08294v1-abstract-full" style="display: none;"> The integration of artificial intelligence into development research methodologies presents unprecedented opportunities for addressing persistent challenges in participatory research, particularly in linguistically diverse regions like South Asia. Drawing from an empirical implementation in Sri Lanka's Sinhala-speaking communities, this paper presents an empirically grounded methodological framework designed to transform participatory development research, situated in the challenging multilingual context of Sri Lanka's flood-prone Nilwala River Basin. 
Moving beyond conventional translation and data collection tools, this framework deploys a multi-agent system architecture that redefines how data collection, analysis, and community engagement are conducted in linguistically and culturally diverse research settings. This structured agent-based approach enables participatory research that is both scalable and responsive, ensuring that community perspectives remain integral to research outcomes. Field experiences reveal the immense potential of LLM-based systems in addressing long-standing issues in development research across resource-limited regions, offering both quantitative efficiencies and qualitative improvements in inclusivity. At a broader methodological level, this research agenda advocates for AI-driven participatory research tools that maintain ethical considerations, cultural respect, and operational efficiency, highlighting strategic pathways for deploying AI systems that reinforce community agency and equitable knowledge generation, potentially informing broader research agendas across the Global South. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08294v1-abstract-full').style.display = 'none'; document.getElementById('2411.08294v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08135">arXiv:2411.08135</a> <span> [<a href="https://arxiv.org/pdf/2411.08135">pdf</a>, <a href="https://arxiv.org/format/2411.08135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> On the Role of Speech Data in Reducing Toxicity Detection Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bell%2C+S+J">Samuel J. Bell</a>, <a href="/search/cs?searchtype=author&query=Meglioli%2C+M+C">Mariano Coria Meglioli</a>, <a href="/search/cs?searchtype=author&query=Richards%2C+M">Megan Richards</a>, <a href="/search/cs?searchtype=author&query=S%C3%A1nchez%2C+E">Eduardo Sánchez</a>, <a href="/search/cs?searchtype=author&query=Ropers%2C+C">Christophe Ropers</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Skyler Wang</a>, <a href="/search/cs?searchtype=author&query=Williams%2C+A">Adina Williams</a>, <a href="/search/cs?searchtype=author&query=Sagun%2C+L">Levent Sagun</a>, <a href="/search/cs?searchtype=author&query=Costa-juss%C3%A0%2C+M+R">Marta R.
Costa-jussà</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08135v1-abstract-short" style="display: inline;"> Text toxicity detection systems exhibit significant biases, producing disproportionate rates of false positives on samples mentioning demographic groups. But what about toxicity detection in speech? To investigate the extent to which text-based biases are mitigated by speech-based systems, we produce a set of high-quality group annotations for the multilingual MuTox dataset, and then leverage thes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08135v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08135v1-abstract-full" style="display: none;"> Text toxicity detection systems exhibit significant biases, producing disproportionate rates of false positives on samples mentioning demographic groups. But what about toxicity detection in speech? To investigate the extent to which text-based biases are mitigated by speech-based systems, we produce a set of high-quality group annotations for the multilingual MuTox dataset, and then leverage these annotations to systematically compare speech- and text-based toxicity classifiers. Our findings indicate that access to speech data during inference supports reduced bias against group mentions, particularly for ambiguous and disagreement-inducing samples. Our results also suggest that improving classifiers, rather than transcription pipelines, is more helpful for reducing group bias. We publicly release our annotations and provide recommendations for future toxicity dataset construction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08135v1-abstract-full').style.display = 'none'; document.getElementById('2411.08135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
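<p>A minimal sketch of the kind of bias measurement discussed above: compare the false-positive rate of a toxicity classifier on samples that mention a demographic group against the rest. The toy labels and predictions are illustrative, not MuTox data.</p>
<pre><code>
import numpy as np

# Toy predictions: 1 = flagged toxic. "group" marks whether the sample mentions
# a demographic group, as in the group annotations described above.
label = np.array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0])
pred  = np.array([1, 0, 1, 0, 1, 0, 0, 1, 1, 0])
group = np.array([1, 1, 1, 0, 1, 0, 0, 0, 1, 0])   # 1 = mentions a group

def false_positive_rate(label, pred, mask):
    negatives = np.logical_and(label == 0, mask)
    return float((pred[negatives] == 1).mean())

fpr_group = false_positive_rate(label, pred, group == 1)
fpr_other = false_positive_rate(label, pred, group == 0)
print(f"FPR on group mentions: {fpr_group:.2f}, elsewhere: {fpr_other:.2f}, gap: {fpr_group - fpr_other:.2f}")
</code></pre>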
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08028">arXiv:2411.08028</a> <span> [<a href="https://arxiv.org/pdf/2411.08028">pdf</a>, <a href="https://arxiv.org/format/2411.08028">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning with Less: Knowledge Distillation from Large Language Models via Unlabeled Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Juanhui Li</a>, <a href="/search/cs?searchtype=author&query=Nag%2C+S">Sreyashi Nag</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xianfeng Tang</a>, <a href="/search/cs?searchtype=author&query=Sarwar%2C+S">Sheikh Sarwar</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+L">Limeng Cui</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Hansu Gu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Suhang Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qi He</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiliang Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08028v1-abstract-short" style="display: inline;"> In real-world NLP applications, Large Language Models (LLMs) offer promising solutions due to their extensive training on vast datasets. However, the large size and high computation demands of LLMs limit their practicality in many applications, especially when further fine-tuning is required. To address these limitations, smaller models are typically preferred for deployment. However, their traini… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08028v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08028v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08028v1-abstract-full" style="display: none;"> In real-world NLP applications, Large Language Models (LLMs) offer promising solutions due to their extensive training on vast datasets. However, the large size and high computation demands of LLMs limit their practicality in many applications, especially when further fine-tuning is required. To address these limitations, smaller models are typically preferred for deployment. However, their training is hindered by the scarcity of labeled data. In contrast, unlabeled data is often readily available, which can be leveraged by using LLMs to generate pseudo-labels for training smaller models. This enables the smaller models (student) to acquire knowledge from LLMs (teacher) while reducing computational costs. This process introduces challenges, such as potential noisy pseudo-labels. Selecting high-quality and informative data is therefore critical to enhance model performance while improving the efficiency of data utilization. To address this, we propose LLKD that enables Learning with Less computational resources and less data for Knowledge Distillation from LLMs. LLKD is an adaptive sample selection method that incorporates signals from both the teacher and student.
Specifically, it prioritizes samples where the teacher demonstrates high confidence in its labeling, indicating reliable labels, and where the student exhibits a high information need, identifying challenging samples that require further learning. Our comprehensive experiments show that LLKD achieves superior performance across various datasets with higher data efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08028v1-abstract-full').style.display = 'none'; document.getElementById('2411.08028v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07763">arXiv:2411.07763</a> <span> [<a href="https://arxiv.org/pdf/2411.07763">pdf</a>, <a href="https://arxiv.org/format/2411.07763">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+F">Fangyu Lei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yuxiao Ye</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+R">Ruisheng Cao</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+D">Dongchan Shin</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongjin Su</a>, <a href="/search/cs?searchtype=author&query=Suo%2C+Z">Zhaoqing Suo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Hongcheng Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Wenjing Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+P">Pengcheng Yin</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+V">Victor Zhong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoxi Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qian Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sida Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07763v1-abstract-short" style="display: inline;"> Real-world enterprise text-to-SQL workflows often involve complex cloud or local data across various database systems, multiple SQL queries in various dialects, and diverse operations from data transformation to analytics. We introduce Spider 2.0, an evaluation framework comprising 632 real-world text-to-SQL workflow problems derived from enterprise-level database use cases. 
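
LLKD's selection rule keeps pseudo-labeled samples where the teacher is confident and the student is still uncertain. The sketch below shows one way to implement such a filter; the entropy-based student signal, threshold, and top-fraction heuristic are illustrative assumptions rather than the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def select_for_distillation(teacher_logits, student_logits,
                            conf_threshold=0.9, top_fraction=0.5):
    """Keep samples with confident teacher labels and an uncertain student."""
    teacher_probs = F.softmax(teacher_logits, dim=-1)
    teacher_conf, pseudo_labels = teacher_probs.max(dim=-1)

    student_probs = F.softmax(student_logits, dim=-1)
    student_entropy = -(student_probs * student_probs.clamp_min(1e-12).log()).sum(-1)

    confident = (teacher_conf >= conf_threshold).nonzero(as_tuple=True)[0]
    if confident.numel() == 0:
        return confident, pseudo_labels[confident]

    # Among confidently labeled samples, keep those the student finds hardest.
    k = max(1, int(top_fraction * confident.numel()))
    hardest = student_entropy[confident].topk(k).indices
    selected = confident[hardest]
    return selected, pseudo_labels[selected]
```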

arXiv:2411.07763 (https://arxiv.org/abs/2411.07763) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.DB (Databases)
Title: Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows
Authors: Fangyu Lei, Jixuan Chen, Yuxiao Ye, Ruisheng Cao, Dongchan Shin, Hongjin Su, Zhaoqing Suo, Hongcheng Gao, Wenjing Hu, Pengcheng Yin, Victor Zhong, Caiming Xiong, Ruoxi Sun, Qian Liu, Sida Wang, Tao Yu
Abstract: Real-world enterprise text-to-SQL workflows often involve complex cloud or local data across various database systems, multiple SQL queries in various dialects, and diverse operations from data transformation to analytics. We introduce Spider 2.0, an evaluation framework comprising 632 real-world text-to-SQL workflow problems derived from enterprise-level database use cases. The databases in Spider 2.0 are sourced from real data applications, often containing over 1,000 columns and stored in local or cloud database systems such as BigQuery and Snowflake. We show that solving problems in Spider 2.0 frequently requires understanding and searching through database metadata, dialect documentation, and even project-level codebases. This challenge calls for models to interact with complex SQL workflow environments, process extremely long contexts, perform intricate reasoning, and generate multiple SQL queries with diverse operations, often exceeding 100 lines, which goes far beyond traditional text-to-SQL challenges. Our evaluations indicate that our code agent framework, based on o1-preview, successfully solves only 17.0% of the tasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. Our results on Spider 2.0 show that while language models have demonstrated remarkable performance in code generation, especially in prior text-to-SQL benchmarks, they require significant improvement in order to achieve adequate performance for real-world enterprise usage. Progress on Spider 2.0 represents crucial steps towards developing intelligent, autonomous code agents for real-world enterprise settings. Our code, baseline models, and data are available at https://spider2-sql.github.io.
Submitted 12 November, 2024; originally announced November 2024.

arXiv:2411.07506 (https://arxiv.org/abs/2411.07506) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: FM-TS: Flow Matching for Time Series Generation
Authors: Yang Hu, Xiao Wang, Lirong Wu, Huatian Zhang, Stan Z. Li, Sheng Wang, Tianlong Chen
Abstract: Time series generation has emerged as an essential tool for analyzing temporal data across numerous fields. While diffusion models have recently gained significant attention in generating high-quality time series, they tend to be computationally demanding and reliant on complex stochastic processes. To address these limitations, we introduce FM-TS, a rectified Flow Matching-based framework for Time Series generation, which simplifies the time series generation process by directly optimizing continuous trajectories. This approach avoids the need for iterative sampling or complex noise schedules typically required in diffusion-based models. FM-TS is more efficient in terms of training and inference. Moreover, FM-TS is highly adaptive, supporting both conditional and unconditional time series generation. Notably, through our novel inference design, the model trained in an unconditional setting can seamlessly generalize to conditional tasks without the need for retraining. Extensive benchmarking across both settings demonstrates that FM-TS consistently delivers superior performance compared to existing approaches while being more efficient in terms of training and inference. For instance, in terms of discriminative score, FM-TS achieves 0.005, 0.019, 0.011, 0.005, 0.053, and 0.106 on the Sines, Stocks, ETTh, MuJoCo, Energy, and fMRI unconditional time series datasets, respectively, significantly outperforming the second-best method, which achieves 0.006, 0.067, 0.061, 0.008, 0.122, and 0.167 on the same datasets. We have also achieved superior performance in solar forecasting and MuJoCo imputation tasks, significantly enhanced by our innovative $t$ power sampling method. The code is available at https://github.com/UNITES-Lab/FMTS.
Submitted 11 November, 2024; originally announced November 2024.
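
Rectified flow matching, the core of FM-TS, trains a network to predict the constant velocity between a noise sample and a data sample along a straight-line interpolation, then generates by integrating that velocity field. A generic sketch of the training loss and an Euler sampler (the model interface, shapes, and step count are assumptions; this is not the FM-TS architecture or its $t$ power sampling):

```python
import torch
import torch.nn.functional as F

def flow_matching_loss(model, x1):
    """One rectified-flow training step on a batch of real series x1
    of shape (batch, length, channels); model(x_t, t) predicts velocity."""
    x0 = torch.randn_like(x1)                        # noise endpoint
    t = torch.rand(x1.size(0), 1, 1, device=x1.device)
    x_t = (1.0 - t) * x0 + t * x1                    # straight-line interpolation
    target_velocity = x1 - x0                        # constant along the path
    return F.mse_loss(model(x_t, t.view(-1)), target_velocity)

@torch.no_grad()
def sample(model, shape, steps=50, device="cpu"):
    """Generate by Euler-integrating the learned velocity from t=0 to t=1."""
    x = torch.randn(shape, device=device)
    dt = 1.0 / steps
    for i in range(steps):
        t = torch.full((shape[0],), i * dt, device=device)
        x = x + model(x, t) * dt
    return x
```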

arXiv:2411.07360 (https://arxiv.org/abs/2411.07360) [pdf, other]
Subjects: cs.SE (Software Engineering)
Title: ChatGPT Inaccuracy Mitigation during Technical Report Understanding: Are We There Yet?
Authors: Salma Begum Tamanna, Gias Uddin, Song Wang, Lan Xia, Longyu Zhang
Abstract: Hallucinations, the tendency to produce irrelevant or incorrect responses, are a prevalent concern in generative AI-based tools like ChatGPT. Although hallucinations in ChatGPT have been studied for textual responses, it is unknown how ChatGPT hallucinates for technical texts that contain both textual and technical terms. We surveyed 47 software engineers and produced a benchmark of 412 Q&A pairs from the bug reports of two OSS projects. We find that a RAG-based ChatGPT (i.e., ChatGPT tuned with the benchmark issue reports) is 36.4% correct when producing answers to the questions, due to two reasons: (1) limited ability to understand complex technical content in code snippets like stack traces, and (2) limited ability to integrate the context conveyed by technical terms and texts. We present CHIME (ChatGPT Inaccuracy Mitigation Engine), whose underlying principle is that if we can preprocess the technical reports better and guide the query validation process in ChatGPT, we can address the observed limitations. CHIME uses context-free grammars (CFGs) to parse stack traces in technical reports. CHIME then verifies and fixes ChatGPT responses by applying metamorphic testing and query transformation. On our benchmark, CHIME shows a 30.3% improvement in correctness over ChatGPT responses. In a user study, we find that the improved responses with CHIME are considered more useful than those generated by ChatGPT without CHIME.
Submitted 11 November, 2024; originally announced November 2024.
Journal ref: 47th IEEE/ACM International Conference on Software Engineering (ICSE 2025)
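
The preprocessing step CHIME applies, parsing stack traces with a context-free grammar, can be pictured with a much simpler stand-in. The snippet below uses a regular expression over Java-style frames purely to illustrate the kind of structure (class, method, file, line) such a parser recovers; it is not the grammar used by CHIME.

```python
import re

# Illustrative only: matches Java-style "at package.Class.method(File.java:123)" frames.
FRAME = re.compile(
    r"^\s*at\s+(?P<cls>[\w$.]+)\.(?P<method>[\w$<>]+)\((?P<file>[\w.]+):(?P<line>\d+)\)"
)

def parse_stack_trace(text: str):
    """Return structured frames from a raw stack trace pasted into a bug report."""
    return [m.groupdict() for line in text.splitlines() if (m := FRAME.match(line))]

trace = """Exception in thread "main" java.lang.NullPointerException
    at com.example.Service.load(Service.java:42)
    at com.example.Main.main(Main.java:7)"""
print(parse_stack_trace(trace))
```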

arXiv:2411.07123 (https://arxiv.org/abs/2411.07123) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Fast and Robust Contextual Node Representation Learning over Dynamic Graphs
Authors: Xingzhi Guo, Silong Wang, Baojian Zhou, Yanghua Xiao, Steven Skiena
Abstract: Real-world graphs grow rapidly with edge and vertex insertions over time, motivating the problem of efficiently maintaining robust node representations over evolving graphs. Recent efficient GNNs are designed to decouple recursive message passing from the learning process, and favor Personalized PageRank (PPR) as the underlying feature propagation mechanism. However, most PPR-based GNNs are designed for static graphs, and efficient PPR maintenance remains an open problem. Further, there is surprisingly little theoretical justification for the choice of PPR, despite its impressive empirical performance. In this paper, we are inspired by the recent formulation of PPR as an explicit $\ell_1$-regularized optimization problem and propose a unified dynamic graph learning framework based on sparse node-wise attention. We also present a set of desired properties that justify the choice of PPR in state-of-the-art GNNs and serve as guidelines for future node attention designs. Meanwhile, we take advantage of the PPR-equivalent optimization formulation and employ the proximal gradient method (ISTA) to improve the efficiency of PPR-based GNNs by up to 6 times. Finally, we instantiate a simple yet effective model (GoPPE) with robust positional encodings obtained by maximizing PPR, previously used as attention. The model performs comparably to or better than the state-of-the-art baselines and greatly outperforms them when the initial node attributes are noisy during graph evolution, demonstrating the effectiveness and robustness of GoPPE.
Submitted 11 November, 2024; originally announced November 2024.
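
The $\ell_1$-regularized view of PPR mentioned above turns the PageRank linear system into a sparse optimization problem that proximal gradient (ISTA) steps can solve, with soft-thresholding keeping the iterate sparse. A generic sketch of that idea (the objective, normalization, and constants follow a common formulation in the local-PPR literature and may differ from the paper's):

```python
import numpy as np
import scipy.sparse as sp

def ppr_ista(adj, seed, alpha=0.15, rho=1e-5, step=0.5, iters=200):
    """Approximate a personalized PageRank vector by ISTA on
        min_x 0.5 * x^T (I - (1 - alpha) P) x - alpha * s^T x + rho * ||x||_1
    where P is the symmetrically normalized adjacency and s the seed indicator."""
    n = adj.shape[0]
    deg = np.asarray(adj.sum(axis=1)).ravel()
    d_inv_sqrt = sp.diags(1.0 / np.sqrt(np.maximum(deg, 1)))
    P = d_inv_sqrt @ adj @ d_inv_sqrt
    s = np.zeros(n)
    s[seed] = 1.0

    x = np.zeros(n)
    for _ in range(iters):
        grad = x - (1.0 - alpha) * (P @ x) - alpha * s
        x = x - step * grad
        x = np.sign(x) * np.maximum(np.abs(x) - step * rho, 0.0)  # soft-threshold
    return x
```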

arXiv:2411.06655 (https://arxiv.org/abs/2411.06655) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Explore the Reasoning Capability of LLMs in the Chess Testbed
Authors: Shu Wang, Lei Ji, Renxi Wang, Wenxiao Zhao, Haokun Liu, Yifan Hou, Ying Nian Wu
Abstract: Reasoning is a central capability of human intelligence. In recent years, with the advent of large-scale datasets, pretrained large language models have emerged with new capabilities, including reasoning. However, these models still struggle with long-term, complex reasoning tasks, such as playing chess. Based on the observation that expert chess players employ a dual approach combining long-term strategic play with short-term tactical play, along with language explanation, we propose improving the reasoning capability of large language models in chess by integrating annotated strategies and tactics. Specifically, we collect a dataset named MATE, which consists of 1 million chess positions with candidate moves annotated by chess experts for strategy and tactics. We finetune the LLaMA-3-8B model and compare it against state-of-the-art commercial language models in the task of selecting better chess moves. Our experiments show that our models perform better than GPT, Claude, and Gemini models. We find that language explanations can enhance the reasoning capability of large language models.
Submitted 10 November, 2024; originally announced November 2024.
Comments: submitted to NAACL2025

arXiv:2411.06638 (https://arxiv.org/abs/2411.06638) [pdf, other]
Subjects: cs.SE (Software Engineering); cs.CL (Computation and Language)
Title: Model Editing for LLMs4Code: How Far are We?
Authors: Xiaopeng Li, Shangwen Wang, Shasha Li, Jun Ma, Jie Yu, Xiaodong Liu, Jing Wang, Bin Ji, Weimin Zhang
Abstract: Large Language Models for Code (LLMs4Code) have been found to exhibit outstanding performance in the software engineering domain, especially remarkable performance on coding tasks. However, even the most advanced LLMs4Code can inevitably contain incorrect or outdated code knowledge. Due to the high cost of training LLMs4Code, it is impractical to re-train the models to fix such problematic code knowledge. Model editing is a new technical field for effectively and efficiently correcting erroneous knowledge in LLMs, and various model editing techniques and benchmarks have been proposed recently. Despite that, a comprehensive study that thoroughly compares and analyzes the performance of state-of-the-art model editing techniques for adapting the knowledge within LLMs4Code across various code-related tasks is notably absent. To bridge this gap, we perform the first systematic study on applying state-of-the-art model editing approaches to repair the inaccuracy of LLMs4Code. To that end, we introduce a benchmark named CLMEEval, which consists of two datasets, i.e., CoNaLa-Edit (CNLE) with 21K+ code generation samples and CodeSearchNet-Edit (CSNE) with 16K+ code summarization samples. With the help of CLMEEval, we evaluate six advanced model editing techniques on three LLMs4Code models: CodeLlama (7B), CodeQwen1.5 (7B), and Stable-Code (3B). We find that the external memorization-based GRACE approach achieves the best knowledge editing effectiveness and specificity (the editing does not influence untargeted knowledge), while generalization (whether the editing can generalize to other semantically identical inputs) is a universal challenge for existing techniques. Furthermore, building on in-depth case analysis, we introduce an enhanced version of GRACE called A-GRACE, which incorporates contrastive learning to better capture the semantics of the inputs.
Submitted 10 November, 2024; originally announced November 2024.
Comments: Accepted by ICSE2025. The code is available at: https://github.com/xpq-tech/code-llmedit.git
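
GRACE, the editing approach that performed best here, is an "external memorization" method: the base model stays frozen and a small codebook sits at one layer, replacing an activation with a learned value whenever the incoming hidden state falls within a deferral radius of a stored key. A minimal sketch of that mechanism, with the layer choice, radius, and shapes as illustrative assumptions rather than the benchmark's configuration:

```python
import torch
import torch.nn as nn

class GraceStyleAdaptor(nn.Module):
    """Wrap one frozen layer with a key-value codebook of edits: the nearest
    stored key within `radius` overrides the layer's output for that input."""

    def __init__(self, layer: nn.Module, radius: float = 1.0):
        super().__init__()
        self.layer = layer
        self.radius = radius
        self.keys = []                       # cached hidden states that triggered edits
        self.values = nn.ParameterList()     # learned replacement activations

    def add_edit(self, key_hidden: torch.Tensor, init_value: torch.Tensor):
        self.keys.append(key_hidden.detach())
        self.values.append(nn.Parameter(init_value.clone()))

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        out = self.layer(hidden)
        if not self.keys:
            return out
        keys = torch.stack(self.keys)                 # (num_edits, dim)
        dists = torch.cdist(hidden, keys)             # hidden: (batch, dim)
        nearest_dist, nearest_idx = dists.min(dim=-1)
        hits = nearest_dist <= self.radius            # inside the deferral radius
        if hits.any():
            out = out.clone()
            for i in hits.nonzero(as_tuple=True)[0]:
                out[i] = self.values[nearest_idx[i]]
        return out
```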

arXiv:2411.06444 (https://arxiv.org/abs/2411.06444) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: SamRobNODDI: Q-Space Sampling-Augmented Continuous Representation Learning for Robust and Generalized NODDI
Authors: Taohui Xiao, Jian Cheng, Wenxin Fan, Enqing Dong, Hairong Zheng, Shanshan Wang
Abstract: Neurite Orientation Dispersion and Density Imaging (NODDI) microstructure estimation from diffusion magnetic resonance imaging (dMRI) is of great significance for the discovery and treatment of various neurological diseases. Current deep learning-based methods accelerate NODDI parameter estimation and improve its accuracy. However, most methods require the number and coordinates of gradient directions to remain strictly consistent between training and testing, significantly limiting the generalization and robustness of these models in NODDI parameter estimation. In this paper, we propose a q-space sampling augmentation-based continuous representation learning framework (SamRobNODDI) to achieve robust and generalized NODDI. Specifically, a continuous representation learning method based on q-space sampling augmentation is introduced to fully explore the information between different gradient directions in q-space. Furthermore, we design a sampling consistency loss to constrain the outputs of different sampling schemes, ensuring that the outputs remain as consistent as possible, thereby further enhancing performance and robustness to varying q-space sampling schemes. SamRobNODDI is also a flexible framework that can be applied to different backbone networks. To validate the effectiveness of the proposed method, we compared it with 7 state-of-the-art methods across 18 different q-space sampling schemes, demonstrating that SamRobNODDI has better performance, robustness, generalization, and flexibility.
Submitted 10 November, 2024; originally announced November 2024.
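
The sampling consistency loss described above penalizes disagreement between the parameter maps predicted from two different random subsets of q-space directions for the same input. A minimal sketch under those assumptions (the subset size, loss weighting, and network interface are illustrative):

```python
import torch
import torch.nn.functional as F

def sampling_consistency_step(model, signals, directions, target,
                              n_sub=30, lam=0.1):
    """signals: (batch, n_dirs) dMRI measurements; directions: (n_dirs, 3).

    Draw two random q-space sampling schemes, predict NODDI parameters from
    each, and combine a supervised loss with a consistency term."""
    n_dirs = signals.size(1)
    idx_a = torch.randperm(n_dirs)[:n_sub]
    idx_b = torch.randperm(n_dirs)[:n_sub]

    pred_a = model(signals[:, idx_a], directions[idx_a])
    pred_b = model(signals[:, idx_b], directions[idx_b])

    supervised = F.mse_loss(pred_a, target) + F.mse_loss(pred_b, target)
    consistency = F.mse_loss(pred_a, pred_b)   # outputs should agree across schemes
    return supervised + lam * consistency
```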

arXiv:2411.06326 (https://arxiv.org/abs/2411.06326) [pdf]
Subjects: cs.HC (Human-Computer Interaction); cs.LG (Machine Learning)
Title: Emotion-Aware Interaction Design in Intelligent User Interface Using Multi-Modal Deep Learning
Authors: Shiyu Duan, Ziyi Wang, Shixiao Wang, Mengmeng Chen, Runsheng Zhang
Abstract: In an era where user interaction with technology is ubiquitous, the importance of user interface (UI) design cannot be overstated. A well-designed UI not only enhances usability but also fosters more natural, intuitive, and emotionally engaging experiences, making technology more accessible and impactful in everyday life. This research addresses this growing need by introducing an advanced emotion recognition system to significantly improve the emotional responsiveness of UIs. By integrating facial expressions, speech, and textual data through a multi-branch Transformer model, the system interprets complex emotional cues in real time, enabling UIs to interact more empathetically and effectively with users. Using the public MELD dataset for validation, our model demonstrates substantial improvements in emotion recognition accuracy and F1 scores, outperforming traditional methods. These findings underscore the critical role that sophisticated emotion recognition plays in the evolution of UIs, making technology more attuned to user needs and emotions. This study highlights how enhanced emotional intelligence in UIs is not only about technical innovation but also about fostering deeper, more meaningful connections between users and the digital world, ultimately shaping how people interact with technology in their daily lives.
Submitted 9 November, 2024; originally announced November 2024.
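
The system described is a multi-branch architecture: separate encoders produce face, speech, and text embeddings that are fused before an emotion classifier. A minimal sketch of that fusion pattern (the encoder dimensions, attention-based fusion, and 7-class output are assumptions, not the paper's exact design):

```python
import torch
import torch.nn as nn

class MultiBranchEmotionModel(nn.Module):
    """Fuse face, audio, and text embeddings for emotion classification."""

    def __init__(self, face_dim=512, audio_dim=256, text_dim=768,
                 hidden=256, num_emotions=7):
        super().__init__()
        self.face_proj = nn.Linear(face_dim, hidden)
        self.audio_proj = nn.Linear(audio_dim, hidden)
        self.text_proj = nn.Linear(text_dim, hidden)
        fusion_layer = nn.TransformerEncoderLayer(d_model=hidden, nhead=4,
                                                  batch_first=True)
        self.fusion = nn.TransformerEncoder(fusion_layer, num_layers=2)
        self.classifier = nn.Linear(hidden, num_emotions)

    def forward(self, face_emb, audio_emb, text_emb):
        # Treat each modality embedding as one token and let attention mix them.
        tokens = torch.stack([self.face_proj(face_emb),
                              self.audio_proj(audio_emb),
                              self.text_proj(text_emb)], dim=1)
        fused = self.fusion(tokens).mean(dim=1)
        return self.classifier(fused)
```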

arXiv:2411.06272 (https://arxiv.org/abs/2411.06272) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.CE (Computational Engineering, Finance, and Science)
Title: Golden Touchstone: A Comprehensive Bilingual Benchmark for Evaluating Financial Large Language Models
Authors: Xiaojun Wu, Junxi Liu, Huanyi Su, Zhouchi Lin, Yiyan Qi, Chengjin Xu, Jiajun Su, Jiajie Zhong, Fuwei Wang, Saizhuo Wang, Fengrui Hua, Jia Li, Jian Guo
Abstract: As large language models become increasingly prevalent in the financial sector, there is a pressing need for a standardized method to comprehensively assess their performance. However, existing finance benchmarks often suffer from limited language and task coverage, as well as challenges such as low-quality datasets and inadequate adaptability for LLM evaluation. To address these limitations, we propose "Golden Touchstone", the first comprehensive bilingual benchmark for financial LLMs, which incorporates representative datasets from both Chinese and English across eight core financial NLP tasks. Developed from extensive open-source data collection and industry-specific demands, this benchmark includes a variety of financial tasks aimed at thoroughly assessing models' language understanding and generation capabilities. Through comparative analysis of major models on the benchmark, such as GPT-4o, Llama3, FinGPT, and FinMA, we reveal their strengths and limitations in processing complex financial information. Additionally, we open-sourced Touchstone-GPT, a financial LLM trained through continual pre-training and financial instruction tuning, which demonstrates strong performance on the bilingual benchmark but still has limitations in specific tasks. This research not only provides financial large language models with a practical evaluation tool but also guides the development and optimization of future research. The source code for Golden Touchstone and the model weights of Touchstone-GPT have been made publicly available at https://github.com/IDEA-FinAI/Golden-Touchstone, contributing to the ongoing evolution of FinLLMs and fostering further research in this critical area.
Submitted 9 November, 2024; originally announced November 2024.
Comments: 26 pages, 9 tables, 3 figures

arXiv:2411.05361 (https://arxiv.org/abs/2411.05361) [pdf, other]
Subjects: cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks
Authors: Chien-yu Huang, Wei-Chih Chen, Shu-wen Yang, Andy T. Liu, Chen-An Li, Yu-Xiang Lin, Wei-Cheng Tseng, Anuj Diwan, Yi-Jen Shih, Jiatong Shi, William Chen, Xuanjun Chen, Chi-Yuan Hsiao, Puyuan Peng, Shih-Heng Wang, Chun-Yi Kuan, Ke-Han Lu, Kai-Wei Chang, Chih-Kai Yang, Fabian Ritter-Gutierrez, Ming To Chuang, Kuan-Po Huang, Siddhant Arora, You-Kuan Lin, Eunjung Yeo, et al. (53 additional authors not shown)
Abstract: Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline.
Submitted 8 November, 2024; originally announced November 2024.