Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 552 results for author: <span class="mathjax">Guo, S</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Guo%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Guo, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Guo%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Guo, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Guo%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16300">arXiv:2411.16300</a> <span> [<a href="https://arxiv.org/pdf/2411.16300">pdf</a>, <a href="https://arxiv.org/format/2411.16300">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BayLing 2: A Multilingual Large Language Model with Efficient Language Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaolei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kehao Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Q">Qingkai Fang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shoutao Guo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yan Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaodong Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yang Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16300v1-abstract-short" style="display: inline;"> Large language models (LLMs), with their powerful generative capabilities and vast knowledge, empower various tasks in everyday life. However, these abilities are primarily concentrated in high-resource languages, leaving low-resource languages with weaker generative capabilities and relatively limited knowledge. Enhancing the multilingual capabilities of LLMs is therefore crucial for serving over… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16300v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16300v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16300v1-abstract-full" style="display: none;"> Large language models (LLMs), with their powerful generative capabilities and vast knowledge, empower various tasks in everyday life. However, these abilities are primarily concentrated in high-resource languages, leaving low-resource languages with weaker generative capabilities and relatively limited knowledge. Enhancing the multilingual capabilities of LLMs is therefore crucial for serving over 100 linguistic communities worldwide. An intuitive approach to enhance the multilingual capabilities would be to construct instruction data for various languages, but constructing instruction data for over 100 languages is prohibitively costly. In this paper, we introduce BayLing 2, which efficiently transfers generative capabilities and knowledge from high-resource languages to low-resource languages through language alignment. To achieve this, we constructed a dataset of 3.2 million instructions, comprising high-resource language instructions (Chinese and English) and cross-lingual instructions for 100+ languages and performed instruction tuning based on the dataset to facilitate the capability transfer between languages. Using Llama as the foundation model, we developed BayLing-2-7B, BayLing-2-13B, and BayLing-3-8B, and conducted a comprehensive evaluation of BayLing. For multilingual translation across 100+ languages, BayLing shows superior performance compared to open-source models of similar scale. For multilingual knowledge and understanding benchmarks, BayLing achieves significant improvements across over 20 low-resource languages, demonstrating its capability of effective knowledge transfer from high-resource to low-resource languages. Furthermore, results on English benchmarks indicate that BayLing maintains high performance in highresource languages while enhancing the performance in low-resource languages. Demo, homepage, code and models of BayLing are available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16300v1-abstract-full').style.display = 'none'; document.getElementById('2411.16300v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">BayLing 2's online demo: http://nlp.ict.ac.cn/bayling/demo. BayLing 2's code and models: https://github.com/ictnlp/BayLing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15781">arXiv:2411.15781</a> <span> [<a href="https://arxiv.org/pdf/2411.15781">pdf</a>, <a href="https://arxiv.org/format/2411.15781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Multi-user Offloading of Personalized Diffusion Models: A DRL-Convex Hybrid Solution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wanting Yang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+S">Shiwen Mao</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a>, <a href="/search/cs?searchtype=author&query=Debbah%2C+M">Merouane Debbah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15781v1-abstract-short" style="display: inline;"> With the impressive generative capabilities of diffusion models, personalized content synthesis has emerged as the most highly anticipated. However, the large model sizes and iterative nature of inference make it difficult to deploy personalized diffusion models broadly on local devices with varying computational power. To this end, we propose a novel framework for efficient multi-user offloading… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15781v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15781v1-abstract-full" style="display: none;"> With the impressive generative capabilities of diffusion models, personalized content synthesis has emerged as the most highly anticipated. However, the large model sizes and iterative nature of inference make it difficult to deploy personalized diffusion models broadly on local devices with varying computational power. To this end, we propose a novel framework for efficient multi-user offloading of personalized diffusion models, given a variable number of users, diverse user computational capabilities, and fluctuating available computational resources on the edge server. To enhance computational efficiency and reduce storage burden on edge servers, we first propose a tailored multi-user hybrid inference manner, where the inference process for each user is split into two phases with an optimizable split point. The initial phase of inference is processed on a cluster-wide model using batching techniques, generating low-level semantic information corresponding to each user's prompt. Then, the users employ their own personalized model to add further details in the later inference phase. Given the constraints on edge server computational resources and users' preferences for low latency and high accuracy, we model the joint optimization of each user's offloading request handling and split point as an extension of the Generalized Quadratic Assignment Problem (GQAP). Our objective is to maximize a comprehensive metric that accounts for both latency and accuracy across all users. To tackle this NP-hard problem, we transform the GQAP into an adaptive decision sequence, model it as a Markov decision process, and develop a hybrid solution combining deep reinforcement learning with convex optimization techniques. Simulation results validate the effectiveness of our framework, demonstrating superior optimality and low complexity compared to traditional methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15781v1-abstract-full').style.display = 'none'; document.getElementById('2411.15781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15419">arXiv:2411.15419</a> <span> [<a href="https://arxiv.org/pdf/2411.15419">pdf</a>, <a href="https://arxiv.org/format/2411.15419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Communication-Efficient Sparsely-Activated Model Training via Sequence Migration and Token Condensation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+F">Fahao Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Z">Zicong Hong</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhou Su</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15419v1-abstract-short" style="display: inline;"> Mixture-of-Experts (MoE) is an emerging technique for scaling large models with sparse activation. MoE models are typically trained in a distributed manner with an expert parallelism scheme, where experts in each MoE layer are distributed across multiple GPUs. However, the default expert parallelism suffers from the heavy network burden due to the all-to-all intermediate data exchange among GPUs b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15419v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15419v1-abstract-full" style="display: none;"> Mixture-of-Experts (MoE) is an emerging technique for scaling large models with sparse activation. MoE models are typically trained in a distributed manner with an expert parallelism scheme, where experts in each MoE layer are distributed across multiple GPUs. However, the default expert parallelism suffers from the heavy network burden due to the all-to-all intermediate data exchange among GPUs before and after the expert run. Some existing works have proposed to reduce intermediate data exchanges by transferring experts to reduce the network loads, however, which would decrease parallelism level of expert execution and make computation inefficient. The weaknesses of existing works motivate us to explore whether it is possible to reduce inter-GPU traffic while maintaining a high degree of expert parallelism. This paper gives a positive response by presenting Luffy, a communication-efficient distributed MoE training system with two new techniques. First, Luffy migrates sequences among GPUs to hide heavy token pulling paths within GPUs and avoid copying experts over GPUs. Second, we propose token condensation that identifies similar tokens and then eliminates redundant transmissions. We implement Luffy based on PyTorch and evaluate its performance on a testbed of 16 V100 GPUs. Luffy system can achieve a speedup of up to 2.73x compared to state-of-the-art MoE training systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15419v1-abstract-full').style.display = 'none'; document.getElementById('2411.15419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14228">arXiv:2411.14228</a> <span> [<a href="https://arxiv.org/pdf/2411.14228">pdf</a>, <a href="https://arxiv.org/format/2411.14228">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FocusLLaVA: A Coarse-to-Fine Approach for Efficient and Effective Visual Token Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chi Xie</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shuang Liang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Sheng Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14228v1-abstract-short" style="display: inline;"> Recent advances on Multi-modal Large Language Models have demonstrated that high-resolution image input is crucial for model capabilities, especially for fine-grained tasks. However, high-resolution images lead to a quadratic increase in the number of visual tokens input into LLMs, resulting in significant computational costs. Current work develop visual token compression methods to achieve effici… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14228v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14228v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14228v1-abstract-full" style="display: none;"> Recent advances on Multi-modal Large Language Models have demonstrated that high-resolution image input is crucial for model capabilities, especially for fine-grained tasks. However, high-resolution images lead to a quadratic increase in the number of visual tokens input into LLMs, resulting in significant computational costs. Current work develop visual token compression methods to achieve efficiency improvements, often at the expense of performance. We argue that removing visual redundancy can simultaneously improve both efficiency and performance. We build a coarse-to-fine visual token compression method, with a vision-guided sampler for compressing redundant regions with low information density, and a text-guided sampler for selecting visual tokens that are strongly correlated with the user instructions.With these two modules, the proposed FocusLLaVA achieves improvements in both efficiency and performance. We validate the effectiveness of our approach on a wide range of evaluation datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14228v1-abstract-full').style.display = 'none'; document.getElementById('2411.14228v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13979">arXiv:2411.13979</a> <span> [<a href="https://arxiv.org/pdf/2411.13979">pdf</a>, <a href="https://arxiv.org/format/2411.13979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FedRAV: Hierarchically Federated Region-Learning for Traffic Object Classification of Autonomous Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yijun Zhai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Pengzhan Zhou</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuepeng He</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+F">Fang Qu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhida Qin</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+X">Xianlong Jiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guiyan Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Songtao Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13979v1-abstract-short" style="display: inline;"> The emerging federated learning enables distributed autonomous vehicles to train equipped deep learning models collaboratively without exposing their raw data, providing great potential for utilizing explosively growing autonomous driving data. However, considering the complicated traffic environments and driving scenarios, deploying federated learning for autonomous vehicles is inevitably challen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13979v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13979v1-abstract-full" style="display: none;"> The emerging federated learning enables distributed autonomous vehicles to train equipped deep learning models collaboratively without exposing their raw data, providing great potential for utilizing explosively growing autonomous driving data. However, considering the complicated traffic environments and driving scenarios, deploying federated learning for autonomous vehicles is inevitably challenged by non-independent and identically distributed (Non-IID) data of vehicles, which may lead to failed convergence and low training accuracy. In this paper, we propose a novel hierarchically Federated Region-learning framework of Autonomous Vehicles (FedRAV), a two-stage framework, which adaptively divides a large area containing vehicles into sub-regions based on the defined region-wise distance, and achieves personalized vehicular models and regional models. This approach ensures that the personalized vehicular model adopts the beneficial models while discarding the unprofitable ones. We validate our FedRAV framework against existing federated learning algorithms on three real-world autonomous driving datasets in various heterogeneous settings. The experiment results demonstrate that our framework outperforms those known algorithms, and improves the accuracy by at least 3.69%. The source code of FedRAV is available at: https://github.com/yjzhai-cs/FedRAV. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13979v1-abstract-full').style.display = 'none'; document.getElementById('2411.13979v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11288">arXiv:2411.11288</a> <span> [<a href="https://arxiv.org/pdf/2411.11288">pdf</a>, <a href="https://arxiv.org/format/2411.11288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neuron: Learning Context-Aware Evolving Representations for Zero-Shot Skeleton Action Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jingcai Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11288v2-abstract-short" style="display: inline;"> Zero-shot skeleton action recognition is a non-trivial task that requires robust unseen generalization with prior knowledge from only seen classes and shared semantics. Existing methods typically build the skeleton-semantics interactions by uncontrollable mappings and conspicuous representations, thereby can hardly capture the intricate and fine-grained relationship for effective cross-modal trans… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11288v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11288v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11288v2-abstract-full" style="display: none;"> Zero-shot skeleton action recognition is a non-trivial task that requires robust unseen generalization with prior knowledge from only seen classes and shared semantics. Existing methods typically build the skeleton-semantics interactions by uncontrollable mappings and conspicuous representations, thereby can hardly capture the intricate and fine-grained relationship for effective cross-modal transferability. To address these issues, we propose a novel dyNamically Evolving dUal skeleton-semantic syneRgistic framework with the guidance of cOntext-aware side informatioN (dubbed Neuron), to explore more fine-grained cross-modal correspondence from micro to macro perspectives at both spatial and temporal levels, respectively. Concretely, 1) we first construct the spatial-temporal evolving micro-prototypes and integrate dynamic context-aware side information to capture the intricate and synergistic skeleton-semantic correlations step-by-step, progressively refining cross-model alignment; and 2) we introduce the spatial compression and temporal memory mechanisms to guide the growth of spatial-temporal micro-prototypes, enabling them to absorb structure-related spatial representations and regularity-dependent temporal patterns. Notably, such processes are analogous to the learning and growth of neurons, equipping the framework with the capacity to generalize to novel unseen action categories. Extensive experiments on various benchmark datasets demonstrated the superiority of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11288v2-abstract-full').style.display = 'none'; document.getElementById('2411.11288v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09439">arXiv:2411.09439</a> <span> [<a href="https://arxiv.org/pdf/2411.09439">pdf</a>, <a href="https://arxiv.org/format/2411.09439">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Spider: Any-to-Many Multimodal LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+J">Jinxiang Lai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jian Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xiaocheng Lu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09439v1-abstract-short" style="display: inline;"> Multimodal LLMs (MLLMs) have emerged as an extension of Large Language Models (LLMs), enabling the integration of various modalities. However, Any-to-Any MLLMs are limited to generating pairwise modalities 'Text + X' within a single response, such as Text + {Image or Audio or Video}. To address this limitation, we introduce Spider, a novel efficient Any-to-Many Modalities Generation (AMMG) framewo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09439v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09439v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09439v1-abstract-full" style="display: none;"> Multimodal LLMs (MLLMs) have emerged as an extension of Large Language Models (LLMs), enabling the integration of various modalities. However, Any-to-Any MLLMs are limited to generating pairwise modalities 'Text + X' within a single response, such as Text + {Image or Audio or Video}. To address this limitation, we introduce Spider, a novel efficient Any-to-Many Modalities Generation (AMMG) framework, which can generate an arbitrary combination of modalities 'Text + Xs', such as Text + {Image and Audio and Video}. To achieve efficient AMMG, our Spider integrates three core components: a Base Model for basic X-to-X (i.e., Any-to-Any) modality processing, a novel Efficient Decoders-Controller for controlling multimodal Decoders to generate Xs (many-modal) contents, and an Any-to-Many Instruction Template designed for producing Xs signal prompts. To train Spider, we constructed a novel Text-formatted Many-Modal (TMM) dataset, which facilitates the learning of the X-to-Xs (i.e., Any-to-Many) capability necessary for AMMG. Ultimately, the well-trained Spider generates a pseudo X-to-Xs dataset, the first-ever X-to-Xs many-modal dataset, enhancing the potential for AMMG task in future research. Overall, this work not only pushes the boundary of multimodal interaction but also provides rich data support for advancing the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09439v1-abstract-full').style.display = 'none'; document.getElementById('2411.09439v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07407">arXiv:2411.07407</a> <span> [<a href="https://arxiv.org/pdf/2411.07407">pdf</a>, <a href="https://arxiv.org/format/2411.07407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Using Generative AI and Multi-Agents to Provide Automatic Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuchen Guo</a>, <a href="/search/cs?searchtype=author&query=Latif%2C+E">Ehsan Latif</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuan Huang</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+X">Xiaoming Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07407v1-abstract-short" style="display: inline;"> This study investigates the use of generative AI and multi-agent systems to provide automatic feedback in educational contexts, particularly for student constructed responses in science assessments. The research addresses a key gap in the field by exploring how multi-agent systems, called AutoFeedback, can improve the quality of GenAI-generated feedback, overcoming known issues such as over-praise… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07407v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07407v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07407v1-abstract-full" style="display: none;"> This study investigates the use of generative AI and multi-agent systems to provide automatic feedback in educational contexts, particularly for student constructed responses in science assessments. The research addresses a key gap in the field by exploring how multi-agent systems, called AutoFeedback, can improve the quality of GenAI-generated feedback, overcoming known issues such as over-praise and over-inference that are common in single-agent large language models (LLMs). The study developed a multi-agent system consisting of two AI agents: one for generating feedback and another for validating and refining it. The system was tested on a dataset of 240 student responses, and its performance was compared to that of a single-agent LLM. Results showed that AutoFeedback significantly reduced the occurrence of over-praise and over-inference errors, providing more accurate and pedagogically sound feedback. The findings suggest that multi-agent systems can offer a more reliable solution for generating automated feedback in educational settings, highlighting their potential for scalable and personalized learning support. These results have important implications for educators and researchers seeking to leverage AI in formative assessments, offering a pathway to more effective feedback mechanisms that enhance student learning outcomes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07407v1-abstract-full').style.display = 'none'; document.getElementById('2411.07407v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05261">arXiv:2411.05261</a> <span> [<a href="https://arxiv.org/pdf/2411.05261">pdf</a>, <a href="https://arxiv.org/format/2411.05261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Decoding Report Generators: A Cyclic Vision-Language Adapter for Counterfactual Explanations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yingying Fang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zihao Jin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shaojie Guo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinda Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yijian Gao</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+J">Junzhi Ning</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Z">Zhiling Yue</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Walsh%2C+S+L">Simon LF Walsh</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05261v1-abstract-short" style="display: inline;"> Despite significant advancements in report generation methods, a critical limitation remains: the lack of interpretability in the generated text. This paper introduces an innovative approach to enhance the explainability of text generated by report generation models. Our method employs cyclic text manipulation and visual comparison to identify and elucidate the features in the original content tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05261v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05261v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05261v1-abstract-full" style="display: none;"> Despite significant advancements in report generation methods, a critical limitation remains: the lack of interpretability in the generated text. This paper introduces an innovative approach to enhance the explainability of text generated by report generation models. Our method employs cyclic text manipulation and visual comparison to identify and elucidate the features in the original content that influence the generated text. By manipulating the generated reports and producing corresponding images, we create a comparative framework that highlights key attributes and their impact on the text generation process. This approach not only identifies the image features aligned to the generated text but also improves transparency but also provides deeper insights into the decision-making mechanisms of the report generation models. Our findings demonstrate the potential of this method to significantly enhance the interpretability and transparency of AI-generated reports. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05261v1-abstract-full').style.display = 'none'; document.getElementById('2411.05261v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02429">arXiv:2411.02429</a> <span> [<a href="https://arxiv.org/pdf/2411.02429">pdf</a>, <a href="https://arxiv.org/format/2411.02429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> IdeaBench: Benchmarking Large Language Models for Research Idea Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Sikun Guo</a>, <a href="/search/cs?searchtype=author&query=Shariatmadari%2C+A+H">Amir Hassan Shariatmadari</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Guangzhi Xiong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+A">Albert Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+E">Eric Xie</a>, <a href="/search/cs?searchtype=author&query=Bekiranov%2C+S">Stefan Bekiranov</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aidong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02429v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have transformed how people interact with artificial intelligence (AI) systems, achieving state-of-the-art results in various tasks, including scientific discovery and hypothesis generation. However, the lack of a comprehensive and systematic evaluation framework for generating research ideas using LLMs poses a significant obstacle to understanding and assessing their… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02429v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02429v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02429v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have transformed how people interact with artificial intelligence (AI) systems, achieving state-of-the-art results in various tasks, including scientific discovery and hypothesis generation. However, the lack of a comprehensive and systematic evaluation framework for generating research ideas using LLMs poses a significant obstacle to understanding and assessing their generative capabilities in scientific discovery. To address this gap, we propose IdeaBench, a benchmark system that includes a comprehensive dataset and an evaluation framework for standardizing the assessment of research idea generation using LLMs. Our dataset comprises titles and abstracts from a diverse range of influential papers, along with their referenced works. To emulate the human process of generating research ideas, we profile LLMs as domain-specific researchers and ground them in the same context considered by human researchers. This maximizes the utilization of the LLMs' parametric knowledge to dynamically generate new research ideas. We also introduce an evaluation framework for assessing the quality of generated research ideas. Our evaluation framework is a two-stage process: first, using GPT-4o to rank ideas based on user-specified quality indicators such as novelty and feasibility, enabling scalable personalization; and second, calculating relative ranking based "Insight Score" to quantify the chosen quality indicator. The proposed benchmark system will be a valuable asset for the community to measure and compare different LLMs, ultimately advancing the automation of the scientific discovery process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02429v1-abstract-full').style.display = 'none'; document.getElementById('2411.02429v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02382">arXiv:2411.02382</a> <span> [<a href="https://arxiv.org/pdf/2411.02382">pdf</a>, <a href="https://arxiv.org/format/2411.02382">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Improving Scientific Hypothesis Generation with Knowledge Grounded Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Guangzhi Xiong</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+E">Eric Xie</a>, <a href="/search/cs?searchtype=author&query=Shariatmadari%2C+A+H">Amir Hassan Shariatmadari</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Sikun Guo</a>, <a href="/search/cs?searchtype=author&query=Bekiranov%2C+S">Stefan Bekiranov</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aidong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02382v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable capabilities in various scientific domains, from natural language processing to complex problem-solving tasks. Their ability to understand and generate human-like text has opened up new possibilities for advancing scientific research, enabling tasks such as data analysis, literature review, and even experimental design. One of the most prom… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02382v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02382v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02382v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable capabilities in various scientific domains, from natural language processing to complex problem-solving tasks. Their ability to understand and generate human-like text has opened up new possibilities for advancing scientific research, enabling tasks such as data analysis, literature review, and even experimental design. One of the most promising applications of LLMs in this context is hypothesis generation, where they can identify novel research directions by analyzing existing knowledge. However, despite their potential, LLMs are prone to generating ``hallucinations'', outputs that are plausible-sounding but factually incorrect. Such a problem presents significant challenges in scientific fields that demand rigorous accuracy and verifiability, potentially leading to erroneous or misleading conclusions. To overcome these challenges, we propose KG-CoI (Knowledge Grounded Chain of Ideas), a novel system that enhances LLM hypothesis generation by integrating external, structured knowledge from knowledge graphs (KGs). KG-CoI guides LLMs through a structured reasoning process, organizing their output as a chain of ideas (CoI), and includes a KG-supported module for the detection of hallucinations. With experiments on our newly constructed hypothesis generation dataset, we demonstrate that KG-CoI not only improves the accuracy of LLM-generated hypotheses but also reduces the hallucination in their reasoning chains, highlighting its effectiveness in advancing real-world scientific research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02382v1-abstract-full').style.display = 'none'; document.getElementById('2411.02382v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02310">arXiv:2411.02310</a> <span> [<a href="https://arxiv.org/pdf/2411.02310">pdf</a>, <a href="https://arxiv.org/format/2411.02310">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MdEval: Massively Multilingual Code Debugging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shukai Liu</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+L">Linzheng Chai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiajun Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">He Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liran Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+K">Ke Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hualei Zhu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuyue Guo</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+T">Tao Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Y">Yunlong Duan</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yu Hao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liqun Yang</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+G">Guanglin Niu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhoujun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02310v1-abstract-short" style="display: inline;"> Code large language models (LLMs) have made significant progress in code debugging by directly generating the correct code based on the buggy code snippet. Programming benchmarks, typically consisting of buggy code snippet and their associated test cases, are used to assess the debugging capabilities of LLMs. However, many existing benchmarks primarily focus on Python and are often limited in term… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02310v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02310v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02310v1-abstract-full" style="display: none;"> Code large language models (LLMs) have made significant progress in code debugging by directly generating the correct code based on the buggy code snippet. Programming benchmarks, typically consisting of buggy code snippet and their associated test cases, are used to assess the debugging capabilities of LLMs. However, many existing benchmarks primarily focus on Python and are often limited in terms of language diversity (e.g., DebugBench and DebugEval). To advance the field of multilingual debugging with LLMs, we propose the first massively multilingual debugging benchmark, which includes 3.6K test samples of 18 programming languages and covers the automated program repair (APR) task, the code review (CR) task, and the bug identification (BI) task. Further, we introduce the debugging instruction corpora MDEVAL-INSTRUCT by injecting bugs into the correct multilingual queries and solutions (xDebugGen). Further, a multilingual debugger xDebugCoder trained on MDEVAL-INSTRUCT as a strong baseline specifically to handle the bugs of a wide range of programming languages (e.g. "Missing Mut" in language Rust and "Misused Macro Definition" in language C). Our extensive experiments on MDEVAL reveal a notable performance gap between open-source models and closed-source LLMs (e.g., GPT and Claude series), highlighting huge room for improvement in multilingual code debugging scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02310v1-abstract-full').style.display = 'none'; document.getElementById('2411.02310v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00823">arXiv:2411.00823</a> <span> [<a href="https://arxiv.org/pdf/2411.00823">pdf</a>, <a href="https://arxiv.org/format/2411.00823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Mobility-LLM: Learning Visiting Intentions and Travel Preferences from Human Mobility Data with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+L">Letian Gong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yan Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yiwen Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xuedi Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yichen Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00823v1-abstract-short" style="display: inline;"> Location-based services (LBS) have accumulated extensive human mobility data on diverse behaviors through check-in sequences. These sequences offer valuable insights into users' intentions and preferences. Yet, existing models analyzing check-in sequences fail to consider the semantics contained in these sequences, which closely reflect human visiting intentions and travel preferences, leading to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00823v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00823v1-abstract-full" style="display: none;"> Location-based services (LBS) have accumulated extensive human mobility data on diverse behaviors through check-in sequences. These sequences offer valuable insights into users' intentions and preferences. Yet, existing models analyzing check-in sequences fail to consider the semantics contained in these sequences, which closely reflect human visiting intentions and travel preferences, leading to an incomplete comprehension. Drawing inspiration from the exceptional semantic understanding and contextual information processing capabilities of large language models (LLMs) across various domains, we present Mobility-LLM, a novel framework that leverages LLMs to analyze check-in sequences for multiple tasks. Since LLMs cannot directly interpret check-ins, we reprogram these sequences to help LLMs comprehensively understand the semantics of human visiting intentions and travel preferences. Specifically, we introduce a visiting intention memory network (VIMN) to capture the visiting intentions at each record, along with a shared pool of human travel preference prompts (HTPP) to guide the LLM in understanding users' travel preferences. These components enhance the model's ability to extract and leverage semantic information from human mobility data effectively. Extensive experiments on four benchmark datasets and three downstream tasks demonstrate that our approach significantly outperforms existing models, underscoring the effectiveness of Mobility-LLM in advancing our understanding of human mobility data within LBS contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00823v1-abstract-full').style.display = 'none'; document.getElementById('2411.00823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24023">arXiv:2410.24023</a> <span> [<a href="https://arxiv.org/pdf/2410.24023">pdf</a>, <a href="https://arxiv.org/format/2410.24023">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Approximate attention with MLP: a pruning strategy for attention-based model in multivariate time series forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Suhan Guo</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiahong Deng</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yi Wei</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+H">Hui Dou</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+F">Furao Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jian Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24023v1-abstract-short" style="display: inline;"> Attention-based architectures have become ubiquitous in time series forecasting tasks, including spatio-temporal (STF) and long-term time series forecasting (LTSF). Yet, our understanding of the reasons for their effectiveness remains limited. This work proposes a new way to understand self-attention networks: we have shown empirically that the entire attention mechanism in the encoder can be redu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24023v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24023v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24023v1-abstract-full" style="display: none;"> Attention-based architectures have become ubiquitous in time series forecasting tasks, including spatio-temporal (STF) and long-term time series forecasting (LTSF). Yet, our understanding of the reasons for their effectiveness remains limited. This work proposes a new way to understand self-attention networks: we have shown empirically that the entire attention mechanism in the encoder can be reduced to an MLP formed by feedforward, skip-connection, and layer normalization operations for temporal and/or spatial modeling in multivariate time series forecasting. Specifically, the Q, K, and V projection, the attention score calculation, the dot-product between the attention score and the V, and the final projection can be removed from the attention-based networks without significantly degrading the performance that the given network remains the top-tier compared to other SOTA methods. For spatio-temporal networks, the MLP-replace-attention network achieves a reduction in FLOPS of $62.579\%$ with a loss in performance less than $2.5\%$; for LTSF, a reduction in FLOPs of $42.233\%$ with a loss in performance less than $2\%$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24023v1-abstract-full').style.display = 'none'; document.getElementById('2410.24023v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22938">arXiv:2410.22938</a> <span> [<a href="https://arxiv.org/pdf/2410.22938">pdf</a>, <a href="https://arxiv.org/format/2410.22938">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DiffLight: A Partial Rewards Conditioned Diffusion Model for Traffic Signal Control with Missing Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanyang Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yang Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xiaowei Mao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22938v2-abstract-short" style="display: inline;"> The application of reinforcement learning in traffic signal control (TSC) has been extensively researched and yielded notable achievements. However, most existing works for TSC assume that traffic data from all surrounding intersections is fully and continuously available through sensors. In real-world applications, this assumption often fails due to sensor malfunctions or data loss, making TSC wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22938v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22938v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22938v2-abstract-full" style="display: none;"> The application of reinforcement learning in traffic signal control (TSC) has been extensively researched and yielded notable achievements. However, most existing works for TSC assume that traffic data from all surrounding intersections is fully and continuously available through sensors. In real-world applications, this assumption often fails due to sensor malfunctions or data loss, making TSC with missing data a critical challenge. To meet the needs of practical applications, we introduce DiffLight, a novel conditional diffusion model for TSC under data-missing scenarios in the offline setting. Specifically, we integrate two essential sub-tasks, i.e., traffic data imputation and decision-making, by leveraging a Partial Rewards Conditioned Diffusion (PRCD) model to prevent missing rewards from interfering with the learning process. Meanwhile, to effectively capture the spatial-temporal dependencies among intersections, we design a Spatial-Temporal transFormer (STFormer) architecture. In addition, we propose a Diffusion Communication Mechanism (DCM) to promote better communication and control performance under data-missing scenarios. Extensive experiments on five datasets with various data-missing scenarios demonstrate that DiffLight is an effective controller to address TSC with missing data. The code of DiffLight is released at https://github.com/lokol5579/DiffLight-release. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22938v2-abstract-full').style.display = 'none'; document.getElementById('2410.22938v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21474">arXiv:2410.21474</a> <span> [<a href="https://arxiv.org/pdf/2410.21474">pdf</a>, <a href="https://arxiv.org/format/2410.21474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Estimating Causal Effects of Text Interventions Leveraging LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Siyi Guo</a>, <a href="/search/cs?searchtype=author&query=Marmarelis%2C+M+G">Myrl G. Marmarelis</a>, <a href="/search/cs?searchtype=author&query=Morstatter%2C+F">Fred Morstatter</a>, <a href="/search/cs?searchtype=author&query=Lerman%2C+K">Kristina Lerman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21474v1-abstract-short" style="display: inline;"> Quantifying the effect of textual interventions in social systems, such as reducing anger in social media posts to see its impact on engagement, poses significant challenges. Direct interventions on real-world systems are often infeasible, necessitating reliance on observational data. Traditional causal inference methods, typically designed for binary or discrete treatments, are inadequate for han… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21474v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21474v1-abstract-full" style="display: none;"> Quantifying the effect of textual interventions in social systems, such as reducing anger in social media posts to see its impact on engagement, poses significant challenges. Direct interventions on real-world systems are often infeasible, necessitating reliance on observational data. Traditional causal inference methods, typically designed for binary or discrete treatments, are inadequate for handling the complex, high-dimensional nature of textual data. This paper addresses these challenges by proposing a novel approach, CausalDANN, to estimate causal effects using text transformations facilitated by large language models (LLMs). Unlike existing methods, our approach accommodates arbitrary textual interventions and leverages text-level classifiers with domain adaptation ability to produce robust effect estimates against domain shifts, even when only the control group is observed. This flexibility in handling various text interventions is a key advancement in causal estimation for textual data, offering opportunities to better understand human behaviors and develop effective policies within social systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21474v1-abstract-full').style.display = 'none'; document.getElementById('2410.21474v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21287">arXiv:2410.21287</a> <span> [<a href="https://arxiv.org/pdf/2410.21287">pdf</a>, <a href="https://arxiv.org/format/2410.21287">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Systematic Assessment of OpenAI o1-Preview for Higher Order Thinking in Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Latif%2C+E">Ehsan Latif</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuchen Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yizhu Gao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Lehong Shi</a>, <a href="/search/cs?searchtype=author&query=Nayaaba%2C+M">Matthew Nayaaba</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gyeonggeon Lee</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/cs?searchtype=author&query=Bewersdorff%2C+A">Arne Bewersdorff</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Luyang Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiantong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huaqin Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haoran Lu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaxi Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jichao Yu</a>, <a href="/search/cs?searchtype=author&query=You%2C+W">Weihang You</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+V+S">Vincent Shung Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jin Lu</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+F">Fei Dou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+P">Ping Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ninghao Liu</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21287v1-abstract-short" style="display: inline;"> As artificial intelligence (AI) continues to advance, it demonstrates capabilities comparable to human intelligence, with significant potential to transform education and workforce development. This study evaluates OpenAI o1-preview's ability to perform higher-order cognitive tasks across 14 dimensions, including critical thinking, systems thinking, computational thinking, design thinking, metacog… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21287v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21287v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21287v1-abstract-full" style="display: none;"> As artificial intelligence (AI) continues to advance, it demonstrates capabilities comparable to human intelligence, with significant potential to transform education and workforce development. This study evaluates OpenAI o1-preview's ability to perform higher-order cognitive tasks across 14 dimensions, including critical thinking, systems thinking, computational thinking, design thinking, metacognition, data literacy, creative thinking, abstract reasoning, quantitative reasoning, logical reasoning, analogical reasoning, and scientific reasoning. We used validated instruments like the Ennis-Weir Critical Thinking Essay Test and the Biological Systems Thinking Test to compare the o1-preview's performance with human performance systematically. Our findings reveal that o1-preview outperforms humans in most categories, achieving 150% better results in systems thinking, computational thinking, data literacy, creative thinking, scientific reasoning, and abstract reasoning. However, compared to humans, it underperforms by around 25% in logical reasoning, critical thinking, and quantitative reasoning. In analogical reasoning, both o1-preview and humans achieved perfect scores. Despite these strengths, the o1-preview shows limitations in abstract reasoning, where human psychology students outperform it, highlighting the continued importance of human oversight in tasks requiring high-level abstraction. These results have significant educational implications, suggesting a shift toward developing human skills that complement AI, such as creativity, abstract reasoning, and critical thinking. This study emphasizes the transformative potential of AI in education and calls for a recalibration of educational goals, teaching methods, and curricula to align with an AI-driven world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21287v1-abstract-full').style.display = 'none'; document.getElementById('2410.21287v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An assessment of OpenAI o1-Preview for Higher Order Thinking in Education</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20253">arXiv:2410.20253</a> <span> [<a href="https://arxiv.org/pdf/2410.20253">pdf</a>, <a href="https://arxiv.org/format/2410.20253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Application of an ANN and LSTM-based Ensemble Model for Stock Market Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fang Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shaobo Guo</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Q">Qianwen Xing</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+X">Xinye Sha</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yuhui Jin</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qi Zheng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20253v3-abstract-short" style="display: inline;"> Stock trading has always been a key economic indicator in modern society and a primary source of profit for financial giants such as investment banks, quantitative trading firms, and hedge funds. Discovering the underlying patterns within the seemingly volatile yet intrinsically structured economic activities has become a central focus of research for many companies. Our study leverages widely-use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20253v3-abstract-full').style.display = 'inline'; document.getElementById('2410.20253v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20253v3-abstract-full" style="display: none;"> Stock trading has always been a key economic indicator in modern society and a primary source of profit for financial giants such as investment banks, quantitative trading firms, and hedge funds. Discovering the underlying patterns within the seemingly volatile yet intrinsically structured economic activities has become a central focus of research for many companies. Our study leverages widely-used modern financial forecasting algorithms, including LSTM, ANN, CNN, and BiLSTM. We begin by comparing the predictive performance of these well-known algorithms on our stock market data, utilizing metrics such as R2, MAE, MSE, RMSE for detailed evaluation. Based on the performance of these models, we then aim to combine their strengths while mitigating their weaknesses, striving to construct a powerful hybrid model that overcomes the performance limitations of individual models.Through rigorous experimentation and exploration, we ultimately developed an LSTM+ANN model that breaks through prior performance bottlenecks, achieving promising and exciting results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20253v3-abstract-full').style.display = 'none'; document.getElementById('2410.20253v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by ICISCAE 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> AE094 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15285">arXiv:2410.15285</a> <span> [<a href="https://arxiv.org/pdf/2410.15285">pdf</a>, <a href="https://arxiv.org/format/2410.15285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Contextual Augmented Multi-Model Programming (CAMP): A Hybrid Local-Cloud Copilot Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuchen Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shangxin Guo</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chee Wei Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15285v1-abstract-short" style="display: inline;"> The advancements in cloud-based Large Languages Models (LLMs) have revolutionized AI-assisted programming. However, their integration into certain local development environments like ones within the Apple software ecosystem (e.g., iOS apps, macOS) remains challenging due to computational demands and sandboxed constraints. This paper presents CAMP, a multi-model AI-assisted programming framework th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15285v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15285v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15285v1-abstract-full" style="display: none;"> The advancements in cloud-based Large Languages Models (LLMs) have revolutionized AI-assisted programming. However, their integration into certain local development environments like ones within the Apple software ecosystem (e.g., iOS apps, macOS) remains challenging due to computational demands and sandboxed constraints. This paper presents CAMP, a multi-model AI-assisted programming framework that consists of a local model that employs Retrieval-Augmented Generation (RAG) to retrieve contextual information from the codebase to facilitate context-aware prompt construction thus optimizing the performance of the cloud model, empowering LLMs' capabilities in local Integrated Development Environments (IDEs). The methodology is actualized in Copilot for Xcode, an AI-assisted programming tool crafted for Xcode that employs the RAG module to address software constraints and enables diverse generative programming tasks, including automatic code completion, documentation, error detection, and intelligent user-agent interaction. The results from objective experiments on generated code quality and subjective experiments on user adoption collectively demonstrate the pilot success of the proposed system and mark its significant contributions to the realm of AI-assisted programming. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15285v1-abstract-full').style.display = 'none'; document.getElementById('2410.15285v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 3 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14281">arXiv:2410.14281</a> <span> [<a href="https://arxiv.org/pdf/2410.14281">pdf</a>, <a href="https://arxiv.org/format/2410.14281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PTR: A Pre-trained Language Model for Trajectory Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tonglong Wei</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yan Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jilin Hu</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+G">Gao Cong</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14281v1-abstract-short" style="display: inline;"> Spatiotemporal trajectory data is vital for web-of-things services and is extensively collected and analyzed by web-based hardware and platforms. However, issues such as service interruptions and network instability often lead to sparsely recorded trajectories, resulting in a loss of detailed movement data. As a result, recovering these trajectories to restore missing information becomes essential… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14281v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14281v1-abstract-full" style="display: none;"> Spatiotemporal trajectory data is vital for web-of-things services and is extensively collected and analyzed by web-based hardware and platforms. However, issues such as service interruptions and network instability often lead to sparsely recorded trajectories, resulting in a loss of detailed movement data. As a result, recovering these trajectories to restore missing information becomes essential. Despite progress, several challenges remain unresolved. First, the lack of large-scale dense trajectory data hampers the performance of existing deep learning methods, which rely heavily on abundant data for supervised training. Second, current methods struggle to generalize across sparse trajectories with varying sampling intervals, necessitating separate re-training for each interval and increasing computational costs. Third, external factors crucial for the recovery of missing points are not fully incorporated. To address these challenges, we propose a framework called PTR. This framework mitigates the issue of limited dense trajectory data by leveraging the capabilities of pre-trained language models (PLMs). PTR incorporates an explicit trajectory prompt and is trained on datasets with multiple sampling intervals, enabling it to generalize effectively across different intervals in sparse trajectories. To capture external factors, we introduce an implicit trajectory prompt that models road conditions, providing richer information for recovering missing points. Additionally, we present a trajectory embedder that encodes trajectory points and transforms the embeddings of both observed and missing points into a format comprehensible to PLMs. Experimental results on two public trajectory datasets with three sampling intervals demonstrate the efficacy and scalability of PTR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14281v1-abstract-full').style.display = 'none'; document.getElementById('2410.14281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11008">arXiv:2410.11008</a> <span> [<a href="https://arxiv.org/pdf/2410.11008">pdf</a>, <a href="https://arxiv.org/format/2410.11008">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> V2I-Calib++: A Multi-terminal Spatial Calibration Approach in Urban Intersections for Collaborative Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qu%2C+Q">Qianxin Qu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yijin Xiong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shichun Guo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Ziqiang Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11008v1-abstract-short" style="display: inline;"> Urban intersections, dense with pedestrian and vehicular traffic and compounded by GPS signal obstructions from high-rise buildings, are among the most challenging areas in urban traffic systems. Traditional single-vehicle intelligence systems often perform poorly in such environments due to a lack of global traffic flow information and the ability to respond to unexpected events. Vehicle-to-Every… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11008v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11008v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11008v1-abstract-full" style="display: none;"> Urban intersections, dense with pedestrian and vehicular traffic and compounded by GPS signal obstructions from high-rise buildings, are among the most challenging areas in urban traffic systems. Traditional single-vehicle intelligence systems often perform poorly in such environments due to a lack of global traffic flow information and the ability to respond to unexpected events. Vehicle-to-Everything (V2X) technology, through real-time communication between vehicles (V2V) and vehicles to infrastructure (V2I), offers a robust solution. However, practical applications still face numerous challenges. Calibration among heterogeneous vehicle and infrastructure endpoints in multi-end LiDAR systems is crucial for ensuring the accuracy and consistency of perception system data. Most existing multi-end calibration methods rely on initial calibration values provided by positioning systems, but the instability of GPS signals due to high buildings in urban canyons poses severe challenges to these methods. To address this issue, this paper proposes a novel multi-end LiDAR system calibration method that does not require positioning priors to determine initial external parameters and meets real-time requirements. Our method introduces an innovative multi-end perception object association technique, utilizing a new Overall Distance metric (oDist) to measure the spatial association between perception objects, and effectively combines global consistency search algorithms with optimal transport theory. By this means, we can extract co-observed targets from object association results for further external parameter computation and optimization. Extensive comparative and ablation experiments conducted on the simulated dataset V2X-Sim and the real dataset DAIR-V2X confirm the effectiveness and efficiency of our method. The code for this method can be accessed at: \url{https://github.com/MassimoQu/v2i-calib}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11008v1-abstract-full').style.display = 'none'; document.getElementById('2410.11008v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06555">arXiv:2410.06555</a> <span> [<a href="https://arxiv.org/pdf/2410.06555">pdf</a>, <a href="https://arxiv.org/format/2410.06555">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ING-VP: MLLMs cannot Play Easy Vision-based Games Yet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoran Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuyue Guo</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+M">Meng Cao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenhao Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06555v1-abstract-short" style="display: inline;"> As multimodal large language models (MLLMs) continue to demonstrate increasingly competitive performance across a broad spectrum of tasks, more intricate and comprehensive benchmarks have been developed to assess these cutting-edge models. These benchmarks introduce new challenges to core capabilities such as perception, reasoning, and planning. However, existing multimodal benchmarks fall short i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06555v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06555v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06555v1-abstract-full" style="display: none;"> As multimodal large language models (MLLMs) continue to demonstrate increasingly competitive performance across a broad spectrum of tasks, more intricate and comprehensive benchmarks have been developed to assess these cutting-edge models. These benchmarks introduce new challenges to core capabilities such as perception, reasoning, and planning. However, existing multimodal benchmarks fall short in providing a focused evaluation of multi-step planning based on spatial relationships in images. To bridge this gap, we present ING-VP, the first INteractive Game-based Vision Planning benchmark, specifically designed to evaluate the spatial imagination and multi-step reasoning abilities of MLLMs. ING-VP features 6 distinct games, encompassing 300 levels, each with 6 unique configurations. A single model engages in over 60,000 rounds of interaction. The benchmark framework allows for multiple comparison settings, including image-text vs. text-only inputs, single-step vs. multi-step reasoning, and with-history vs. without-history conditions, offering valuable insights into the model's capabilities. We evaluated numerous state-of-the-art MLLMs, with the highest-performing model, Claude-3.5 Sonnet, achieving an average accuracy of only 3.37%, far below the anticipated standard. This work aims to provide a specialized evaluation framework to drive advancements in MLLMs' capacity for complex spatial reasoning and planning. The code is publicly available at https://github.com/Thisisus7/ING-VP.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06555v1-abstract-full').style.display = 'none'; document.getElementById('2410.06555v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">49 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06194">arXiv:2410.06194</a> <span> [<a href="https://arxiv.org/pdf/2410.06194">pdf</a>, <a href="https://arxiv.org/format/2410.06194">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prompting DirectSAM for Semantic Contour Extraction in Remote Sensing Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miao%2C+S">Shiyu Miao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Delong Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuanyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yanhui Gu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengjie Guo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06194v1-abstract-short" style="display: inline;"> The Direct Segment Anything Model (DirectSAM) excels in class-agnostic contour extraction. In this paper, we explore its use by applying it to optical remote sensing imagery, where semantic contour extraction-such as identifying buildings, road networks, and coastlines-holds significant practical value. Those applications are currently handled via training specialized small models separately on sm… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06194v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06194v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06194v1-abstract-full" style="display: none;"> The Direct Segment Anything Model (DirectSAM) excels in class-agnostic contour extraction. In this paper, we explore its use by applying it to optical remote sensing imagery, where semantic contour extraction-such as identifying buildings, road networks, and coastlines-holds significant practical value. Those applications are currently handled via training specialized small models separately on small datasets in each domain. We introduce a foundation model derived from DirectSAM, termed DirectSAM-RS, which not only inherits the strong segmentation capability acquired from natural images, but also benefits from a large-scale dataset we created for remote sensing semantic contour extraction. This dataset comprises over 34k image-text-contour triplets, making it at least 30 times larger than individual dataset. DirectSAM-RS integrates a prompter module: a text encoder and cross-attention layers attached to the DirectSAM architecture, which allows flexible conditioning on target class labels or referring expressions. We evaluate the DirectSAM-RS in both zero-shot and fine-tuning setting, and demonstrate that it achieves state-of-the-art performance across several downstream benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06194v1-abstract-full').style.display = 'none'; document.getElementById('2410.06194v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06149">arXiv:2410.06149</a> <span> [<a href="https://arxiv.org/pdf/2410.06149">pdf</a>, <a href="https://arxiv.org/format/2410.06149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3581783.3611851">10.1145/3581783.3611851 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Toward Scalable Image Feature Compression: A Content-Adaptive and Diffusion-Based Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Sha Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ning Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaotong Li</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+L">Lingyu Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06149v1-abstract-short" style="display: inline;"> Traditional image codecs emphasize signal fidelity and human perception, often at the expense of machine vision tasks. Deep learning methods have demonstrated promising coding performance by utilizing rich semantic embeddings optimized for both human and machine vision. However, these compact embeddings struggle to capture fine details such as contours and textures, resulting in imperfect reconstr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06149v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06149v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06149v1-abstract-full" style="display: none;"> Traditional image codecs emphasize signal fidelity and human perception, often at the expense of machine vision tasks. Deep learning methods have demonstrated promising coding performance by utilizing rich semantic embeddings optimized for both human and machine vision. However, these compact embeddings struggle to capture fine details such as contours and textures, resulting in imperfect reconstructions. Furthermore, existing learning-based codecs lack scalability. To address these limitations, this paper introduces a content-adaptive diffusion model for scalable image compression. The proposed method encodes fine textures through a diffusion process, enhancing perceptual quality while preserving essential features for machine vision tasks. The approach employs a Markov palette diffusion model combined with widely used feature extractors and image generators, enabling efficient data compression. By leveraging collaborative texture-semantic feature extraction and pseudo-label generation, the method accurately captures texture information. A content-adaptive Markov palette diffusion model is then applied to represent both low-level textures and high-level semantic content in a scalable manner. This framework offers flexible control over compression ratios by selecting intermediate diffusion states, eliminating the need for retraining deep learning models at different operating points. Extensive experiments demonstrate the effectiveness of the proposed framework in both image reconstruction and downstream machine vision tasks such as object detection, segmentation, and facial landmark detection, achieving superior perceptual quality compared to state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06149v1-abstract-full').style.display = 'none'; document.getElementById('2410.06149v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> in Proceedings of the 31st ACM International Conference on Multimedia, pp. 1431-1442, 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01363">arXiv:2410.01363</a> <span> [<a href="https://arxiv.org/pdf/2410.01363">pdf</a>, <a href="https://arxiv.org/format/2410.01363">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PCQPR: Proactive Conversational Question Planning with Reflection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shasha Guo</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+L">Lizi Liao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cuiping Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01363v1-abstract-short" style="display: inline;"> Conversational Question Generation (CQG) enhances the interactivity of conversational question-answering systems in fields such as education, customer service, and entertainment. However, traditional CQG, focusing primarily on the immediate context, lacks the conversational foresight necessary to guide conversations toward specified conclusions. This limitation significantly restricts their abilit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01363v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01363v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01363v1-abstract-full" style="display: none;"> Conversational Question Generation (CQG) enhances the interactivity of conversational question-answering systems in fields such as education, customer service, and entertainment. However, traditional CQG, focusing primarily on the immediate context, lacks the conversational foresight necessary to guide conversations toward specified conclusions. This limitation significantly restricts their ability to achieve conclusion-oriented conversational outcomes. In this work, we redefine the CQG task as Conclusion-driven Conversational Question Generation (CCQG) by focusing on proactivity, not merely reacting to the unfolding conversation but actively steering it towards a conclusion-oriented question-answer pair. To address this, we propose a novel approach, called Proactive Conversational Question Planning with self-Refining (PCQPR). Concretely, by integrating a planning algorithm inspired by Monte Carlo Tree Search (MCTS) with the analytical capabilities of large language models (LLMs), PCQPR predicts future conversation turns and continuously refines its questioning strategies. This iterative self-refining mechanism ensures the generation of contextually relevant questions strategically devised to reach a specified outcome. Our extensive evaluations demonstrate that PCQPR significantly surpasses existing CQG methods, marking a paradigm shift towards conclusion-oriented conversational question-answering systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01363v1-abstract-full').style.display = 'none'; document.getElementById('2410.01363v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Main</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00753">arXiv:2410.00753</a> <span> [<a href="https://arxiv.org/pdf/2410.00753">pdf</a>, <a href="https://arxiv.org/format/2410.00753">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Drug Delivery in Smart Pharmacies: A Novel Framework of Multi-Stage Grasping Network Combined with Adaptive Robotics Mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+R">Rui Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shirong Guo</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+Y">Yuhang Qiu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Honghui Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lujin Huang</a>, <a href="/search/cs?searchtype=author&query=Yong%2C+M">Ming Yong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Linfu Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Liquan Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00753v1-abstract-short" style="display: inline;"> Robots-based smart pharmacies are essential for modern healthcare systems, enabling efficient drug delivery. However, a critical challenge exists in the robotic handling of drugs with varying shapes and overlapping positions, which previous studies have not adequately addressed. To enhance the robotic arm's ability to grasp chaotic, overlapping, and variously shaped drugs, this paper proposed a no… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00753v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00753v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00753v1-abstract-full" style="display: none;"> Robots-based smart pharmacies are essential for modern healthcare systems, enabling efficient drug delivery. However, a critical challenge exists in the robotic handling of drugs with varying shapes and overlapping positions, which previous studies have not adequately addressed. To enhance the robotic arm's ability to grasp chaotic, overlapping, and variously shaped drugs, this paper proposed a novel framework combining a multi-stage grasping network with an adaptive robotics mechanism. The framework first preprocessed images using an improved Super-Resolution Convolutional Neural Network (SRCNN) algorithm, and then employed the proposed YOLOv5+E-A-SPPFCSPC+BIFPNC (YOLO-EASB) instance segmentation algorithm for precise drug segmentation. The most suitable drugs for grasping can be determined by assessing the completeness of the segmentation masks. Then, these segmented drugs were processed by our improved Adaptive Feature Fusion and Grasp-Aware Network (IAFFGA-Net) with the optimized loss function, which ensures accurate picking actions even in complex environments. To control the robot grasping, a time-optimal robotic arm trajectory planning algorithm that combines an improved ant colony algorithm with 3-5-3 interpolation was developed, further improving efficiency while ensuring smooth trajectories. Finally, this system was implemented and validated within an adaptive collaborative robot setup, which dynamically adjusts to different production environments and task requirements. Experimental results demonstrate the superiority of our multi-stage grasping network in optimizing smart pharmacy operations, while also showcasing its remarkable adaptability and effectiveness in practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00753v1-abstract-full').style.display = 'none'; document.getElementById('2410.00753v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19838">arXiv:2409.19838</a> <span> [<a href="https://arxiv.org/pdf/2409.19838">pdf</a>, <a href="https://arxiv.org/format/2409.19838">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> geom2vec: pretrained GNNs as geometric featurizers for conformational dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pengmei%2C+Z">Zihan Pengmei</a>, <a href="/search/cs?searchtype=author&query=Lorpaiboon%2C+C">Chatipat Lorpaiboon</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S+C">Spencer C. Guo</a>, <a href="/search/cs?searchtype=author&query=Weare%2C+J">Jonathan Weare</a>, <a href="/search/cs?searchtype=author&query=Dinner%2C+A+R">Aaron R. Dinner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19838v1-abstract-short" style="display: inline;"> Identifying informative low-dimensional features that characterize dynamics in molecular simulations remains a challenge, often requiring extensive hand-tuning and system-specific knowledge. Here, we introduce geom2vec, in which pretrained graph neural networks (GNNs) are used as universal geometric featurizers. By pretraining equivariant GNNs on a large dataset of molecular conformations with a s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19838v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19838v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19838v1-abstract-full" style="display: none;"> Identifying informative low-dimensional features that characterize dynamics in molecular simulations remains a challenge, often requiring extensive hand-tuning and system-specific knowledge. Here, we introduce geom2vec, in which pretrained graph neural networks (GNNs) are used as universal geometric featurizers. By pretraining equivariant GNNs on a large dataset of molecular conformations with a self-supervised denoising objective, we learn transferable structural representations that capture molecular geometric patterns without further fine-tuning. We show that the learned representations can be directly used to analyze trajectory data, thus eliminating the need for manual feature selection and improving robustness of the simulation analysis workflows. Importantly, by decoupling GNN training from training for downstream tasks, we enable analysis of larger molecular graphs with limited computational resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19838v1-abstract-full').style.display = 'none'; document.getElementById('2409.19838v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures, supporting information appended</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16537">arXiv:2409.16537</a> <span> [<a href="https://arxiv.org/pdf/2409.16537">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A QoE-Aware Split Inference Accelerating Algorithm for NOMA-based Edge Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xin Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Ning Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Quan Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenchao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhaoxin Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16537v1-abstract-short" style="display: inline;"> Even the AI has been widely used and significantly changed our life, deploying the large AI models on resource limited edge devices directly is not appropriate. Thus, the model split inference is proposed to improve the performance of edge intelligence, in which the AI model is divided into different sub models and the resource-intensive sub model is offloaded to edge server wirelessly for reducin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16537v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16537v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16537v1-abstract-full" style="display: none;"> Even the AI has been widely used and significantly changed our life, deploying the large AI models on resource limited edge devices directly is not appropriate. Thus, the model split inference is proposed to improve the performance of edge intelligence, in which the AI model is divided into different sub models and the resource-intensive sub model is offloaded to edge server wirelessly for reducing resource requirements and inference latency. However, the previous works mainly concentrate on improving and optimizing the system QoS, ignore the effect of QoE which is another critical item for the users except for QoS. Even the QoE has been widely learned in EC, considering the differences between task offloading in EC and split inference in EI, and the specific issues in QoE which are still not addressed in EC and EI, these algorithms cannot work effectively in edge split inference scenarios. Thus, an effective resource allocation algorithm is proposed in this paper, for accelerating split inference in EI and achieving the tradeoff between inference delay, QoE, and resource consumption, abbreviated as ERA. Specifically, the ERA takes the resource consumption, QoE, and inference latency into account to find the optimal model split strategy and resource allocation strategy. Since the minimum inference delay and resource consumption, and maximum QoE cannot be satisfied simultaneously, the gradient descent based algorithm is adopted to find the optimal tradeoff between them. Moreover, the loop iteration GD approach is developed to reduce the complexity of the GD algorithm caused by parameter discretization. Additionally, the properties of the proposed algorithms are investigated, including convergence, complexity, and approximation error. The experimental results demonstrate that the performance of ERA is much better than that of the previous studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16537v1-abstract-full').style.display = 'none'; document.getElementById('2409.16537v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16pages, 19figures. arXiv admin note: substantial text overlap with arXiv:2312.15850</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16149">arXiv:2409.16149</a> <span> [<a href="https://arxiv.org/pdf/2409.16149">pdf</a>, <a href="https://arxiv.org/format/2409.16149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MCTrack: A Unified 3D Multi-Object Tracking Framework for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiyang Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+S">Shouzheng Qi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jieyou Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hangning Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Siyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+K">Kai Tu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Songlin Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jianbo Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jian Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16149v2-abstract-short" style="display: inline;"> This paper introduces MCTrack, a new 3D multi-object tracking method that achieves state-of-the-art (SOTA) performance across KITTI, nuScenes, and Waymo datasets. Addressing the gap in existing tracking paradigms, which often perform well on specific datasets but lack generalizability, MCTrack offers a unified solution. Additionally, we have standardized the format of perceptual results across var… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16149v2-abstract-full').style.display = 'inline'; document.getElementById('2409.16149v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16149v2-abstract-full" style="display: none;"> This paper introduces MCTrack, a new 3D multi-object tracking method that achieves state-of-the-art (SOTA) performance across KITTI, nuScenes, and Waymo datasets. Addressing the gap in existing tracking paradigms, which often perform well on specific datasets but lack generalizability, MCTrack offers a unified solution. Additionally, we have standardized the format of perceptual results across various datasets, termed BaseVersion, facilitating researchers in the field of multi-object tracking (MOT) to concentrate on the core algorithmic development without the undue burden of data preprocessing. Finally, recognizing the limitations of current evaluation metrics, we propose a novel set that assesses motion information output, such as velocity and acceleration, crucial for downstream tasks. The source codes of the proposed method are available at this link: https://github.com/megvii-research/MCTrack}{https://github.com/megvii-research/MCTrack <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16149v2-abstract-full').style.display = 'none'; document.getElementById('2409.16149v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12980">arXiv:2409.12980</a> <span> [<a href="https://arxiv.org/pdf/2409.12980">pdf</a>, <a href="https://arxiv.org/format/2409.12980">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A New People-Object Interaction Dataset and NVS Benchmarks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuai Guo</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+H">Houqiang Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuwen Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyu Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yijie Gao</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiajing Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Rong Xie</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Li Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12980v1-abstract-short" style="display: inline;"> Recently, NVS in human-object interaction scenes has received increasing attention. Existing human-object interaction datasets mainly consist of static data with limited views, offering only RGB images or videos, mostly containing interactions between a single person and objects. Moreover, these datasets exhibit complexities in lighting environments, poor synchronization, and low resolution, hinde… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12980v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12980v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12980v1-abstract-full" style="display: none;"> Recently, NVS in human-object interaction scenes has received increasing attention. Existing human-object interaction datasets mainly consist of static data with limited views, offering only RGB images or videos, mostly containing interactions between a single person and objects. Moreover, these datasets exhibit complexities in lighting environments, poor synchronization, and low resolution, hindering high-quality human-object interaction studies. In this paper, we introduce a new people-object interaction dataset that comprises 38 series of 30-view multi-person or single-person RGB-D video sequences, accompanied by camera parameters, foreground masks, SMPL models, some point clouds, and mesh files. Video sequences are captured by 30 Kinect Azures, uniformly surrounding the scene, each in 4K resolution 25 FPS, and lasting for 1$\sim$19 seconds. Meanwhile, we evaluate some SOTA NVS models on our dataset to establish the NVS benchmarks. We hope our work can inspire further research in humanobject interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12980v1-abstract-full').style.display = 'none'; document.getElementById('2409.12980v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10899">arXiv:2409.10899</a> <span> [<a href="https://arxiv.org/pdf/2409.10899">pdf</a>, <a href="https://arxiv.org/ps/2409.10899">ps</a>, <a href="https://arxiv.org/format/2409.10899">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> </div> </div> <p class="title is-5 mathjax"> Conflict-free chromatic index of trees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shanshan Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E+Y+H">Ethan Y. H. Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Luyi Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Ping Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10899v2-abstract-short" style="display: inline;"> A graph $G$ is conflict-free $k$-edge-colorable if there exists an assignment of $k$ colors to $E(G)$ such that for every edge $e\in E(G)$, there is a color that is assigned to exactly one edge among the closed neighborhood of $e$. The smallest $k$ such that $G$ is conflict-free $k$-edge-colorable is called the conflict-free chromatic index of $G$, denoted $蠂'_{CF}(G)$. D醛bski and Przyby\a{l}o sho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10899v2-abstract-full').style.display = 'inline'; document.getElementById('2409.10899v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10899v2-abstract-full" style="display: none;"> A graph $G$ is conflict-free $k$-edge-colorable if there exists an assignment of $k$ colors to $E(G)$ such that for every edge $e\in E(G)$, there is a color that is assigned to exactly one edge among the closed neighborhood of $e$. The smallest $k$ such that $G$ is conflict-free $k$-edge-colorable is called the conflict-free chromatic index of $G$, denoted $蠂'_{CF}(G)$. D醛bski and Przyby\a{l}o showed that $2\le蠂'_{CF}(T)\le 3$ for every tree $T$ of size at least two. In this paper, we present an algorithm to determine the conflict-free chromatic index of a tree without 2-degree vertices, in time $O(|V(T)|)$. This partially answer a question raised by Kamyczura, Meszka and Przyby\a{l}o. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10899v2-abstract-full').style.display = 'none'; document.getElementById('2409.10899v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10713">arXiv:2409.10713</a> <span> [<a href="https://arxiv.org/pdf/2409.10713">pdf</a>, <a href="https://arxiv.org/format/2409.10713">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3654777.3676359">10.1145/3654777.3676359 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> "The Data Says Otherwise"-Towards Automated Fact-checking and Communication of Data Claims </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yu Fu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shunan Guo</a>, <a href="/search/cs?searchtype=author&query=Hoffswell%2C+J">Jane Hoffswell</a>, <a href="/search/cs?searchtype=author&query=Bursztyn%2C+V+S">Victor S. Bursztyn</a>, <a href="/search/cs?searchtype=author&query=Rossi%2C+R">Ryan Rossi</a>, <a href="/search/cs?searchtype=author&query=Stasko%2C+J">John Stasko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10713v1-abstract-short" style="display: inline;"> Fact-checking data claims requires data evidence retrieval and analysis, which can become tedious and intractable when done manually. This work presents Aletheia, an automated fact-checking prototype designed to facilitate data claims verification and enhance data evidence communication. For verification, we utilize a pre-trained LLM to parse the semantics for evidence retrieval. To effectively co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10713v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10713v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10713v1-abstract-full" style="display: none;"> Fact-checking data claims requires data evidence retrieval and analysis, which can become tedious and intractable when done manually. This work presents Aletheia, an automated fact-checking prototype designed to facilitate data claims verification and enhance data evidence communication. For verification, we utilize a pre-trained LLM to parse the semantics for evidence retrieval. To effectively communicate the data evidence, we design representations in two forms: data tables and visualizations, tailored to various data fact types. Additionally, we design interactions that showcase a real-world application of these techniques. We evaluate the performance of two core NLP tasks with a curated dataset comprising 400 data claims and compare the two representation forms regarding viewers' assessment time, confidence, and preference via a user study with 20 participants. The evaluation offers insights into the feasibility and bottlenecks of using LLMs for data fact-checking tasks, potential advantages and disadvantages of using visualizations over data tables, and design recommendations for presenting data evidence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10713v1-abstract-full').style.display = 'none'; document.getElementById('2409.10713v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 13 figures, UIST 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.2; I.7.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09725">arXiv:2409.09725</a> <span> [<a href="https://arxiv.org/pdf/2409.09725">pdf</a>, <a href="https://arxiv.org/format/2409.09725">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Precise Pick-and-Place using Score-Based Diffusion Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shih-Wei Guo</a>, <a href="/search/cs?searchtype=author&query=Hsiao%2C+T">Tsu-Ching Hsiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu-Lun Liu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+C">Chun-Yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09725v1-abstract-short" style="display: inline;"> In this paper, we propose a novel coarse-to-fine continuous pose diffusion method to enhance the precision of pick-and-place operations within robotic manipulation tasks. Leveraging the capabilities of diffusion networks, we facilitate the accurate perception of object poses. This accurate perception enhances both pick-and-place success rates and overall manipulation precision. Our methodology uti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09725v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09725v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09725v1-abstract-full" style="display: none;"> In this paper, we propose a novel coarse-to-fine continuous pose diffusion method to enhance the precision of pick-and-place operations within robotic manipulation tasks. Leveraging the capabilities of diffusion networks, we facilitate the accurate perception of object poses. This accurate perception enhances both pick-and-place success rates and overall manipulation precision. Our methodology utilizes a top-down RGB image projected from an RGB-D camera and adopts a coarse-to-fine architecture. This architecture enables efficient learning of coarse and fine models. A distinguishing feature of our approach is its focus on continuous pose estimation, which enables more precise object manipulation, particularly concerning rotational angles. In addition, we employ pose and color augmentation techniques to enable effective training with limited data. Through extensive experiments in simulated and real-world scenarios, as well as an ablation study, we comprehensively evaluate our proposed methodology. Taken together, the findings validate its effectiveness in achieving high-precision pick-and-place tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09725v1-abstract-full').style.display = 'none'; document.getElementById('2409.09725v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures. Project webpage: https://tony2guo.github.io/precise-pick-and-place/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07365">arXiv:2409.07365</a> <span> [<a href="https://arxiv.org/pdf/2409.07365">pdf</a>, <a href="https://arxiv.org/format/2409.07365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Event-based Mosaicing Bundle Adjustment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuang Guo</a>, <a href="/search/cs?searchtype=author&query=Gallego%2C+G">Guillermo Gallego</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07365v1-abstract-short" style="display: inline;"> We tackle the problem of mosaicing bundle adjustment (i.e., simultaneous refinement of camera orientations and scene map) for a purely rotating event camera. We formulate the problem as a regularized non-linear least squares optimization. The objective function is defined using the linearized event generation model in the camera orientations and the panoramic gradient map of the scene. We show tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07365v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07365v1-abstract-full" style="display: none;"> We tackle the problem of mosaicing bundle adjustment (i.e., simultaneous refinement of camera orientations and scene map) for a purely rotating event camera. We formulate the problem as a regularized non-linear least squares optimization. The objective function is defined using the linearized event generation model in the camera orientations and the panoramic gradient map of the scene. We show that this BA optimization has an exploitable block-diagonal sparsity structure, so that the problem can be solved efficiently. To the best of our knowledge, this is the first work to leverage such sparsity to speed up the optimization in the context of event-based cameras, without the need to convert events into image-like representations. We evaluate our method, called EMBA, on both synthetic and real-world datasets to show its effectiveness (50% photometric error decrease), yielding results of unprecedented quality. In addition, we demonstrate EMBA using high spatial resolution event cameras, yielding delicate panoramas in the wild, even without an initial map. Project page: https://github.com/tub-rip/emba <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07365v1-abstract-full').style.display = 'none'; document.getElementById('2409.07365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14+11 pages, 11 figures, 10 tables, https://github.com/tub-rip/emba</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> European Conference on Computer Vision (ECCV), Milan, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06666">arXiv:2409.06666</a> <span> [<a href="https://arxiv.org/pdf/2409.06666">pdf</a>, <a href="https://arxiv.org/format/2409.06666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LLaMA-Omni: Seamless Speech Interaction with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Q">Qingkai Fang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shoutao Guo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yan Zhou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhengrui Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaolei Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yang Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06666v1-abstract-short" style="display: inline;"> Models like GPT-4o enable real-time interaction with large language models (LLMs) through speech, significantly enhancing user experience compared to traditional text-based interaction. However, there is still a lack of exploration on how to build speech interaction models based on open-source LLMs. To address this, we propose LLaMA-Omni, a novel model architecture designed for low-latency and hig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06666v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06666v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06666v1-abstract-full" style="display: none;"> Models like GPT-4o enable real-time interaction with large language models (LLMs) through speech, significantly enhancing user experience compared to traditional text-based interaction. However, there is still a lack of exploration on how to build speech interaction models based on open-source LLMs. To address this, we propose LLaMA-Omni, a novel model architecture designed for low-latency and high-quality speech interaction with LLMs. LLaMA-Omni integrates a pretrained speech encoder, a speech adaptor, an LLM, and a streaming speech decoder. It eliminates the need for speech transcription, and can simultaneously generate text and speech responses directly from speech instructions with extremely low latency. We build our model based on the latest Llama-3.1-8B-Instruct model. To align the model with speech interaction scenarios, we construct a dataset named InstructS2S-200K, which includes 200K speech instructions and corresponding speech responses. Experimental results show that compared to previous speech-language models, LLaMA-Omni provides better responses in both content and style, with a response latency as low as 226ms. Additionally, training LLaMA-Omni takes less than 3 days on just 4 GPUs, paving the way for the efficient development of speech-language models in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06666v1-abstract-full').style.display = 'none'; document.getElementById('2409.06666v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Project: https://github.com/ictnlp/LLaMA-Omni</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02728">arXiv:2409.02728</a> <span> [<a href="https://arxiv.org/pdf/2409.02728">pdf</a>, <a href="https://arxiv.org/ps/2409.02728">ps</a>, <a href="https://arxiv.org/format/2409.02728">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Task-Oriented Communication for Graph Data: A Graph Information Bottleneck Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Shujing Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanhu Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuaishuai Guo</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chenyuan Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02728v1-abstract-short" style="display: inline;"> Graph data, essential in fields like knowledge representation and social networks, often involves large networks with many nodes and edges. Transmitting these graphs can be highly inefficient due to their size and redundancy for specific tasks. This paper introduces a method to extract a smaller, task-focused subgraph that maintains key information while reducing communication overhead. Our approa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02728v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02728v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02728v1-abstract-full" style="display: none;"> Graph data, essential in fields like knowledge representation and social networks, often involves large networks with many nodes and edges. Transmitting these graphs can be highly inefficient due to their size and redundancy for specific tasks. This paper introduces a method to extract a smaller, task-focused subgraph that maintains key information while reducing communication overhead. Our approach utilizes graph neural networks (GNNs) and the graph information bottleneck (GIB) principle to create a compact, informative, and robust graph representation suitable for transmission. The challenge lies in the irregular structure of graph data, making GIB optimization complex. We address this by deriving a tractable variational upper bound for the objective function. Additionally, we propose the VQ-GIB mechanism, integrating vector quantization (VQ) to convert subgraph representations into a discrete codebook sequence, compatible with existing digital communication systems. Our experiments show that this GIB-based method significantly lowers communication costs while preserving essential task-related information. The approach demonstrates robust performance across various communication channels, suitable for both continuous and discrete systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02728v1-abstract-full').style.display = 'none'; document.getElementById('2409.02728v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17383">arXiv:2408.17383</a> <span> [<a href="https://arxiv.org/pdf/2408.17383">pdf</a>, <a href="https://arxiv.org/format/2408.17383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MoRe Fine-Tuning with 10x Fewer Parameters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+W">Wenxuan Tan</a>, <a href="/search/cs?searchtype=author&query=Roberts%2C+N">Nicholas Roberts</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tzu-Heng Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jitian Zhao</a>, <a href="/search/cs?searchtype=author&query=Cooper%2C+J">John Cooper</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Samuel Guo</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+C">Chengyu Duan</a>, <a href="/search/cs?searchtype=author&query=Sala%2C+F">Frederic Sala</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17383v1-abstract-short" style="display: inline;"> Parameter-efficient fine-tuning (PEFT) techniques have unlocked the potential to cheaply and easily specialize large pretrained models. However, the most prominent approaches, like low-rank adapters (LoRA), depend on heuristics or rules-of-thumb for their architectural choices -- potentially limiting their performance for new models and architectures. This limitation suggests that techniques from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17383v1-abstract-full').style.display = 'inline'; document.getElementById('2408.17383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17383v1-abstract-full" style="display: none;"> Parameter-efficient fine-tuning (PEFT) techniques have unlocked the potential to cheaply and easily specialize large pretrained models. However, the most prominent approaches, like low-rank adapters (LoRA), depend on heuristics or rules-of-thumb for their architectural choices -- potentially limiting their performance for new models and architectures. This limitation suggests that techniques from neural architecture search could be used to obtain optimal adapter architectures, but these are often expensive and difficult to implement. We address this challenge with Monarch Rectangular Fine-tuning (MoRe), a simple framework to search over adapter architectures that relies on the Monarch matrix class. Theoretically, we show that MoRe is more expressive than LoRA. Empirically, our approach is more parameter-efficient and performant than state-of-the-art PEFTs on a range of tasks and models, with as few as 5\% of LoRA's parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17383v1-abstract-full').style.display = 'none'; document.getElementById('2408.17383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16212">arXiv:2408.16212</a> <span> [<a href="https://arxiv.org/pdf/2408.16212">pdf</a>, <a href="https://arxiv.org/format/2408.16212">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Earth and Planetary Astrophysics">astro-ph.EP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Solar and Stellar Astrophysics">astro-ph.SR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Application of Machine Learning in Tidal Evolution Simulation of Star-Planet Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuaishuai Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jianheng Guo</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+K">KaiFan Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+L">Lei Xing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16212v1-abstract-short" style="display: inline;"> With the release of a large amount of astronomical data, an increasing number of close-in hot Jupiters have been discovered. Calculating their evolutionary curves using star-planet interaction models presents a challenge. To expedite the generation of evolutionary curves for these close-in hot Jupiter systems, we utilized tidal interaction models established on MESA to create 15,745 samples of sta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16212v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16212v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16212v1-abstract-full" style="display: none;"> With the release of a large amount of astronomical data, an increasing number of close-in hot Jupiters have been discovered. Calculating their evolutionary curves using star-planet interaction models presents a challenge. To expedite the generation of evolutionary curves for these close-in hot Jupiter systems, we utilized tidal interaction models established on MESA to create 15,745 samples of star-planet systems and 7,500 samples of stars. Additionally, we employed a neural network (Multi-Layer Perceptron - MLP) to predict the evolutionary curves of the systems, including stellar effective temperature, radius, stellar rotation period, and planetary orbital period. The median relative errors of the predicted evolutionary curves were found to be 0.15%, 0.43%, 2.61%, and 0.57%, respectively. Furthermore, the speed at which we generate evolutionary curves exceeds that of model-generated curves by more than four orders of magnitude. We also extracted features of planetary migration states and utilized lightGBM to classify the samples into 6 categories for prediction. We found that by combining three types that undergo long-term double synchronization into one label, the classifier effectively recognized these features. Apart from systems experiencing long-term double synchronization, the median relative errors of the predicted evolutionary curves were all below 4%. Our work provides an efficient method to save significant computational resources and time with minimal loss in accuracy. This research also lays the foundation for analyzing the evolutionary characteristics of systems under different migration states, aiding in the understanding of the underlying physical mechanisms of such systems. Finally, to a large extent, our approach could replace the calculations of theoretical models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16212v1-abstract-full').style.display = 'none'; document.getElementById('2408.16212v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15251">arXiv:2408.15251</a> <span> [<a href="https://arxiv.org/pdf/2408.15251">pdf</a>, <a href="https://arxiv.org/format/2408.15251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TrajFM: A Vehicle Trajectory Foundation Model for Region and Task Transferability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yan Lin</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tonglong Wei</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zeyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+H">Haomin Wen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jilin Hu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15251v1-abstract-short" style="display: inline;"> Vehicle trajectories provide valuable movement information that supports various downstream tasks and powers real-world applications. A desirable trajectory learning model should transfer between different regions and tasks without retraining, thus improving computational efficiency and effectiveness with limited training data. However, a model's ability to transfer across regions is limited by th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15251v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15251v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15251v1-abstract-full" style="display: none;"> Vehicle trajectories provide valuable movement information that supports various downstream tasks and powers real-world applications. A desirable trajectory learning model should transfer between different regions and tasks without retraining, thus improving computational efficiency and effectiveness with limited training data. However, a model's ability to transfer across regions is limited by the unique spatial features and POI arrangements of each region, which are closely linked to vehicle movement patterns and difficult to generalize. Additionally, achieving task transferability is challenging due to the differing generation schemes required for various tasks. Existing efforts towards transferability primarily involve learning embedding vectors for trajectories, which perform poorly in region transfer and still require retraining of prediction modules for task transfer. To address these challenges, we propose TrajFM, a vehicle trajectory foundation model that excels in both region and task transferability. For region transferability, we introduce STRFormer as the main learnable model within TrajFM. It integrates spatial, temporal, and POI modalities of trajectories to effectively manage variations in POI arrangements across regions and includes a learnable spatio-temporal Rotary position embedding module for handling spatial features. For task transferability, we propose a trajectory masking and recovery scheme. This scheme unifies the generation processes of various tasks into the masking and recovery of modalities and sub-trajectories, allowing TrajFM to be pre-trained once and transferred to different tasks without retraining. Experiments on two real-world vehicle trajectory datasets under various settings demonstrate the effectiveness of TrajFM. Code is available at https://anonymous.4open.science/r/TrajFM-30E4. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15251v1-abstract-full').style.display = 'none'; document.getElementById('2408.15251v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12809">arXiv:2408.12809</a> <span> [<a href="https://arxiv.org/pdf/2408.12809">pdf</a>, <a href="https://arxiv.org/format/2408.12809">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DutyTTE: Deciphering Uncertainty in Origin-Destination Travel Time Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xiaowei Mao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yan Lin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yubin Chen</a>, <a href="/search/cs?searchtype=author&query=Xian%2C+X">Xingyu Xian</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+H">Haomin Wen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qisen Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12809v1-abstract-short" style="display: inline;"> Uncertainty quantification in travel time estimation (TTE) aims to estimate the confidence interval for travel time, given the origin (O), destination (D), and departure time (T). Accurately quantifying this uncertainty requires generating the most likely path and assessing travel time uncertainty along the path. This involves two main challenges: 1) Predicting a path that aligns with the ground t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12809v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12809v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12809v1-abstract-full" style="display: none;"> Uncertainty quantification in travel time estimation (TTE) aims to estimate the confidence interval for travel time, given the origin (O), destination (D), and departure time (T). Accurately quantifying this uncertainty requires generating the most likely path and assessing travel time uncertainty along the path. This involves two main challenges: 1) Predicting a path that aligns with the ground truth, and 2) modeling the impact of travel time in each segment on overall uncertainty under varying conditions. We propose DutyTTE to address these challenges. For the first challenge, we introduce a deep reinforcement learning method to improve alignment between the predicted path and the ground truth, providing more accurate travel time information from road segments to improve TTE. For the second challenge, we propose a mixture of experts guided uncertainty quantification mechanism to better capture travel time uncertainty for each segment under varying contexts. Additionally, we calibrate our results using Hoeffding's upper-confidence bound to provide statistical guarantees for the estimated confidence intervals. Extensive experiments on two real-world datasets demonstrate the superiority of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12809v1-abstract-full').style.display = 'none'; document.getElementById('2408.12809v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12253">arXiv:2408.12253</a> <span> [<a href="https://arxiv.org/pdf/2408.12253">pdf</a>, <a href="https://arxiv.org/format/2408.12253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Epsilon: Exploring Comprehensive Visual-Semantic Projection for Multi-Label Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziming Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jingcai Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xiaocheng Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12253v2-abstract-short" style="display: inline;"> This paper investigates a challenging problem of zero-shot learning in the multi-label scenario (MLZSL), wherein the model is trained to recognize multiple unseen classes within a sample (e.g., an image) based on seen classes and auxiliary knowledge, e.g., semantic information. Existing methods usually resort to analyzing the relationship of various seen classes residing in a sample from the dimen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12253v2-abstract-full').style.display = 'inline'; document.getElementById('2408.12253v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12253v2-abstract-full" style="display: none;"> This paper investigates a challenging problem of zero-shot learning in the multi-label scenario (MLZSL), wherein the model is trained to recognize multiple unseen classes within a sample (e.g., an image) based on seen classes and auxiliary knowledge, e.g., semantic information. Existing methods usually resort to analyzing the relationship of various seen classes residing in a sample from the dimension of spatial or semantic characteristics and transferring the learned model to unseen ones. However, they neglect the integrity of local and global features. Although the use of the attention structure will accurately locate local features, especially objects, it will significantly lose its integrity, and the relationship between classes will also be affected. Rough processing of global features will also directly affect comprehensiveness. This neglect will make the model lose its grasp of the main components of the image. Relying only on the local existence of seen classes during the inference stage introduces unavoidable bias. In this paper, we propose a novel and comprehensive visual-semantic framework for MLZSL, dubbed Epsilon, to fully make use of such properties and enable a more accurate and robust visual-semantic projection. In terms of spatial information, we achieve effective refinement by group aggregating image features into several semantic prompts. It can aggregate semantic information rather than class information, preserving the correlation between semantics. In terms of global semantics, we use global forward propagation to collect as much information as possible to ensure that semantics are not omitted. Experiments on large-scale MLZSL benchmark datasets NUS-Wide and Open-Images-v4 demonstrate that the proposed Epsilon outperforms other state-of-the-art methods with large margins. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12253v2-abstract-full').style.display = 'none'; document.getElementById('2408.12253v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures. arXiv admin note: substantial text overlap with arXiv:2309.00923</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12111">arXiv:2408.12111</a> <span> [<a href="https://arxiv.org/pdf/2408.12111">pdf</a>, <a href="https://arxiv.org/format/2408.12111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZipGait: Bridging Skeleton and Silhouette with Diffusion Model for Advancing Gait Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Min%2C+F">Fanxu Min</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Q">Qing Cai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shaoxiang Guo</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yang Yu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Hao Fan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J">Junyu Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12111v1-abstract-short" style="display: inline;"> Current gait recognition research predominantly focuses on extracting appearance features effectively, but the performance is severely compromised by the vulnerability of silhouettes under unconstrained scenes. Consequently, numerous studies have explored how to harness information from various models, particularly by sufficiently utilizing the intrinsic information of skeleton sequences. While th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12111v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12111v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12111v1-abstract-full" style="display: none;"> Current gait recognition research predominantly focuses on extracting appearance features effectively, but the performance is severely compromised by the vulnerability of silhouettes under unconstrained scenes. Consequently, numerous studies have explored how to harness information from various models, particularly by sufficiently utilizing the intrinsic information of skeleton sequences. While these model-based methods have achieved significant performance, there is still a huge gap compared to appearance-based methods, which implies the potential value of bridging silhouettes and skeletons. In this work, we make the first attempt to reconstruct dense body shapes from discrete skeleton distributions via the diffusion model, demonstrating a new approach that connects cross-modal features rather than focusing solely on intrinsic features to improve model-based methods. To realize this idea, we propose a novel gait diffusion model named DiffGait, which has been designed with four specific adaptations suitable for gait recognition. Furthermore, to effectively utilize the reconstructed silhouettes and skeletons, we introduce Perception Gait Integration (PGI) to integrate different gait features through a two-stage process. Incorporating those modifications leads to an efficient model-based gait recognition framework called ZipGait. Through extensive experiments on four public benchmarks, ZipGait demonstrates superior performance, outperforming the state-of-the-art methods by a large margin under both cross-domain and intra-domain settings, while achieving significant plug-and-play performance improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12111v1-abstract-full').style.display = 'none'; document.getElementById('2408.12111v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10691">arXiv:2408.10691</a> <span> [<a href="https://arxiv.org/pdf/2408.10691">pdf</a>, <a href="https://arxiv.org/format/2408.10691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fine-Tuning and Deploying Large Language Models Over Edges: Issues and Approaches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yanjie Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haijun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengming Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a>, <a href="/search/cs?searchtype=author&query=Leung%2C+V+C+M">Victor C. M. Leung</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiping Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10691v2-abstract-short" style="display: inline;"> Since the invention of GPT2--1.5B in 2019, large language models (LLMs) have transitioned from specialized models to versatile foundation models. The LLMs exhibit impressive zero-shot ability, however, require fine-tuning on local datasets and significant resources for deployment. Traditional fine-tuning techniques with the first-order optimizers require substantial GPU memory that exceeds mainstr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10691v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10691v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10691v2-abstract-full" style="display: none;"> Since the invention of GPT2--1.5B in 2019, large language models (LLMs) have transitioned from specialized models to versatile foundation models. The LLMs exhibit impressive zero-shot ability, however, require fine-tuning on local datasets and significant resources for deployment. Traditional fine-tuning techniques with the first-order optimizers require substantial GPU memory that exceeds mainstream hardware capability. Therefore, memory-efficient methods are motivated to be investigated. Model compression techniques can reduce energy consumption, operational costs, and environmental impact so that to support sustainable artificial intelligence advancements. Additionally, large-scale foundation models have expanded to create images, audio, videos, and multi-modal contents, further emphasizing the need for efficient deployment. Therefore, we are motivated to present a comprehensive overview of the prevalent memory-efficient fine-tuning methods over the network edge. We also review the state-of-the-art literatures on model compression to provide a vision on deploying LLMs over the network edge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10691v2-abstract-full').style.display = 'none'; document.getElementById('2408.10691v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08703">arXiv:2408.08703</a> <span> [<a href="https://arxiv.org/pdf/2408.08703">pdf</a>, <a href="https://arxiv.org/format/2408.08703">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TsCA: On the Semantic Consistency Alignment via Conditional Transport for Compositional Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Miaoge Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jingcai Guo</a>, <a href="/search/cs?searchtype=author&query=Da+Xu%2C+R+Y">Richard Yi Da Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dongsheng Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaofeng Cao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08703v2-abstract-short" style="display: inline;"> Compositional Zero-Shot Learning (CZSL) aims to recognize novel \textit{state-object} compositions by leveraging the shared knowledge of their primitive components. Despite considerable progress, effectively calibrating the bias between semantically similar multimodal representations, as well as generalizing pre-trained knowledge to novel compositional contexts, remains an enduring challenge. In t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08703v2-abstract-full').style.display = 'inline'; document.getElementById('2408.08703v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08703v2-abstract-full" style="display: none;"> Compositional Zero-Shot Learning (CZSL) aims to recognize novel \textit{state-object} compositions by leveraging the shared knowledge of their primitive components. Despite considerable progress, effectively calibrating the bias between semantically similar multimodal representations, as well as generalizing pre-trained knowledge to novel compositional contexts, remains an enduring challenge. In this paper, our interest is to revisit the conditional transport (CT) theory and its homology to the visual-semantics interaction in CZSL and further, propose a novel Trisets Consistency Alignment framework (dubbed TsCA) that well-addresses these issues. Concretely, we utilize three distinct yet semantically homologous sets, i.e., patches, primitives, and compositions, to construct pairwise CT costs to minimize their semantic discrepancies. To further ensure the consistency transfer within these sets, we implement a cycle-consistency constraint that refines the learning by guaranteeing the feature consistency of the self-mapping during transport flow, regardless of modality. Moreover, we extend the CT plans to an open-world setting, which enables the model to effectively filter out unfeasible pairs, thereby speeding up the inference as well as increasing the accuracy. Extensive experiments are conducted to verify the effectiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08703v2-abstract-full').style.display = 'none'; document.getElementById('2408.08703v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08274">arXiv:2408.08274</a> <span> [<a href="https://arxiv.org/pdf/2408.08274">pdf</a>, <a href="https://arxiv.org/format/2408.08274">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BAM! Just Like That: Simple and Efficient Parameter Upcycling for Mixture of Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qizhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Gritsch%2C+N">Nikolas Gritsch</a>, <a href="/search/cs?searchtype=author&query=Gnaneshwar%2C+D">Dwaraknath Gnaneshwar</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Simon Guo</a>, <a href="/search/cs?searchtype=author&query=Cairuz%2C+D">David Cairuz</a>, <a href="/search/cs?searchtype=author&query=Venkitesh%2C+B">Bharat Venkitesh</a>, <a href="/search/cs?searchtype=author&query=Foerster%2C+J">Jakob Foerster</a>, <a href="/search/cs?searchtype=author&query=Blunsom%2C+P">Phil Blunsom</a>, <a href="/search/cs?searchtype=author&query=Ruder%2C+S">Sebastian Ruder</a>, <a href="/search/cs?searchtype=author&query=Ustun%2C+A">Ahmet Ustun</a>, <a href="/search/cs?searchtype=author&query=Locatelli%2C+A">Acyr Locatelli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08274v2-abstract-short" style="display: inline;"> The Mixture of Experts (MoE) framework has become a popular architecture for large language models due to its superior performance over dense models. However, training MoEs from scratch in a large-scale regime is prohibitively expensive. Existing methods mitigate this by pre-training multiple dense expert models independently and using them to initialize an MoE. This is done by using experts' feed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08274v2-abstract-full').style.display = 'inline'; document.getElementById('2408.08274v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08274v2-abstract-full" style="display: none;"> The Mixture of Experts (MoE) framework has become a popular architecture for large language models due to its superior performance over dense models. However, training MoEs from scratch in a large-scale regime is prohibitively expensive. Existing methods mitigate this by pre-training multiple dense expert models independently and using them to initialize an MoE. This is done by using experts' feed-forward network (FFN) to initialize the MoE's experts while merging other parameters. However, this method limits the reuse of dense model parameters to only the FFN layers, thereby constraining the advantages when "upcycling" these models into MoEs. We propose BAM (Branch-Attend-Mix), a simple yet effective method that addresses this shortcoming. BAM makes full use of specialized dense models by not only using their FFN to initialize the MoE layers but also leveraging experts' attention parameters fully by initializing them into a soft-variant of Mixture of Attention (MoA) layers. We explore two methods for upcycling attention parameters: 1) initializing separate attention experts from dense models including all attention parameters for the best model performance; and 2) sharing key and value parameters across all experts to facilitate for better inference efficiency. To further improve efficiency, we adopt a parallel attention transformer architecture to MoEs, which allows the attention experts and FFN experts to be computed concurrently. Our experiments on seed models ranging from 590 million to 2 billion parameters demonstrate that BAM surpasses baselines in both perplexity and downstream task performance, within the same computational and data constraints. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08274v2-abstract-full').style.display = 'none'; document.getElementById('2408.08274v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07966">arXiv:2408.07966</a> <span> [<a href="https://arxiv.org/pdf/2408.07966">pdf</a>, <a href="https://arxiv.org/format/2408.07966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Addressing Skewed Heterogeneity via Federated Prototype Rectification with Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shunxin Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongsong Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shuxia Lin</a>, <a href="/search/cs?searchtype=author&query=Kou%2C+Z">Zhiqiang Kou</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xin Geng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07966v2-abstract-short" style="display: inline;"> Federated learning is an efficient framework designed to facilitate collaborative model training across multiple distributed devices while preserving user data privacy. A significant challenge of federated learning is data-level heterogeneity, i.e., skewed or long-tailed distribution of private data. Although various methods have been proposed to address this challenge, most of them assume that th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07966v2-abstract-full').style.display = 'inline'; document.getElementById('2408.07966v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07966v2-abstract-full" style="display: none;"> Federated learning is an efficient framework designed to facilitate collaborative model training across multiple distributed devices while preserving user data privacy. A significant challenge of federated learning is data-level heterogeneity, i.e., skewed or long-tailed distribution of private data. Although various methods have been proposed to address this challenge, most of them assume that the underlying global data is uniformly distributed across all clients. This paper investigates data-level heterogeneity federated learning with a brief review and redefines a more practical and challenging setting called Skewed Heterogeneous Federated Learning (SHFL). Accordingly, we propose a novel Federated Prototype Rectification with Personalization which consists of two parts: Federated Personalization and Federated Prototype Rectification. The former aims to construct balanced decision boundaries between dominant and minority classes based on private data, while the latter exploits both inter-class discrimination and intra-class consistency to rectify empirical prototypes. Experiments on three popular benchmarks show that the proposed approach outperforms current state-of-the-art methods and achieves balanced performance in both personalization and generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07966v2-abstract-full').style.display = 'none'; document.getElementById('2408.07966v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07344">arXiv:2408.07344</a> <span> [<a href="https://arxiv.org/pdf/2408.07344">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RTAT: A Robust Two-stage Association Tracker for Multi-Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Rujie Liu</a>, <a href="/search/cs?searchtype=author&query=Abe%2C+N">Narishige Abe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07344v1-abstract-short" style="display: inline;"> Data association is an essential part in the tracking-by-detection based Multi-Object Tracking (MOT). Most trackers focus on how to design a better data association strategy to improve the tracking performance. The rule-based handcrafted association methods are simple and highly efficient but lack generalization capability to deal with complex scenes. While the learnt association methods can learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07344v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07344v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07344v1-abstract-full" style="display: none;"> Data association is an essential part in the tracking-by-detection based Multi-Object Tracking (MOT). Most trackers focus on how to design a better data association strategy to improve the tracking performance. The rule-based handcrafted association methods are simple and highly efficient but lack generalization capability to deal with complex scenes. While the learnt association methods can learn high-order contextual information to deal with various complex scenes, but they have the limitations of higher complexity and cost. To address these limitations, we propose a Robust Two-stage Association Tracker, named RTAT. The first-stage association is performed between tracklets and detections to generate tracklets with high purity, and the second-stage association is performed between tracklets to form complete trajectories. For the first-stage association, we use a simple data association strategy to generate tracklets with high purity by setting a low threshold for the matching cost in the assignment process. We conduct the tracklet association in the second-stage based on the framework of message-passing GNN. Our method models the tracklet association as a series of edge classification problem in hierarchical graphs, which can recursively merge short tracklets into longer ones. Our tracker RTAT ranks first on the test set of MOT17 and MOT20 benchmarks in most of the main MOT metrics: HOTA, IDF1, and AssA. We achieve 67.2 HOTA, 84.7 IDF1, and 69.7 AssA on MOT17, and 66.2 HOTA, 82.5 IDF1, and 68.1 AssA on MOT20. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07344v1-abstract-full').style.display = 'none'; document.getElementById('2408.07344v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICPR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04916">arXiv:2408.04916</a> <span> [<a href="https://arxiv.org/pdf/2408.04916">pdf</a>, <a href="https://arxiv.org/format/2408.04916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PTrajM: Efficient and Semantic-rich Trajectory Learning with Pretrained Trajectory-Mamba </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yan Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yichen Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zeyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+H">Haomin Wen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+E">Erwen Zheng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youfang Lin</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04916v1-abstract-short" style="display: inline;"> Vehicle trajectories provide crucial movement information for various real-world applications. To better utilize vehicle trajectories, it is essential to develop a trajectory learning approach that can effectively and efficiently extract rich semantic information, including movement behavior and travel purposes, to support accurate downstream applications. However, creating such an approach presen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04916v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04916v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04916v1-abstract-full" style="display: none;"> Vehicle trajectories provide crucial movement information for various real-world applications. To better utilize vehicle trajectories, it is essential to develop a trajectory learning approach that can effectively and efficiently extract rich semantic information, including movement behavior and travel purposes, to support accurate downstream applications. However, creating such an approach presents two significant challenges. First, movement behavior are inherently spatio-temporally continuous, making them difficult to extract efficiently from irregular and discrete trajectory points. Second, travel purposes are related to the functionalities of areas and road segments traversed by vehicles. These functionalities are not available from the raw spatio-temporal trajectory features and are hard to extract directly from complex textual features associated with these areas and road segments. To address these challenges, we propose PTrajM, a novel method capable of efficient and semantic-rich vehicle trajectory learning. To support efficient modeling of movement behavior, we introduce Trajectory-Mamba as the learnable model of PTrajM, which effectively extracts continuous movement behavior while being more computationally efficient than existing structures. To facilitate efficient extraction of travel purposes, we propose a travel purpose-aware pre-training procedure, which enables PTrajM to discern the travel purposes of trajectories without additional computational resources during its embedding process. Extensive experiments on two real-world datasets and comparisons with several state-of-the-art trajectory learning methods demonstrate the effectiveness of PTrajM. Code is available at https://anonymous.4open.science/r/PTrajM-C973. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04916v1-abstract-full').style.display = 'none'; document.getElementById('2408.04916v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04879">arXiv:2408.04879</a> <span> [<a href="https://arxiv.org/pdf/2408.04879">pdf</a>, <a href="https://arxiv.org/format/2408.04879">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> On the Element-Wise Representation and Reasoning in Zero-Shot Image Recognition: A Systematic Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jingcai Guo</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+Z">Zhijie Rao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhi Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Song Guo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04879v3-abstract-short" style="display: inline;"> Zero-shot image recognition (ZSIR) aims to recognize and reason in unseen domains by learning generalized knowledge from limited data in the seen domain. The gist of ZSIR is constructing a well-aligned mapping between the input visual space and the target semantic space, which is a bottom-up paradigm inspired by the process by which humans observe the world. In recent years, ZSIR has witnessed sig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04879v3-abstract-full').style.display = 'inline'; document.getElementById('2408.04879v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04879v3-abstract-full" style="display: none;"> Zero-shot image recognition (ZSIR) aims to recognize and reason in unseen domains by learning generalized knowledge from limited data in the seen domain. The gist of ZSIR is constructing a well-aligned mapping between the input visual space and the target semantic space, which is a bottom-up paradigm inspired by the process by which humans observe the world. In recent years, ZSIR has witnessed significant progress on a broad spectrum, from theory to algorithm design, as well as widespread applications. However, to the best of our knowledge, there remains a lack of a systematic review of ZSIR from an element-wise perspective, i.e., learning fine-grained elements of data and their inferential associations. To fill the gap, this paper thoroughly investigates recent advances in element-wise ZSIR and provides a sound basis for its future development. Concretely, we first integrate three basic ZSIR tasks, i.e., object recognition, compositional recognition, and foundation model-based open-world recognition, into a unified element-wise paradigm and provide a detailed taxonomy and analysis of the main approaches. Next, we summarize the benchmarks, covering technical implementations, standardized datasets, and some more details as a library. Last, we sketch out related applications, discuss vital challenges, and suggest potential future directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04879v3-abstract-full').style.display = 'none'; document.getElementById('2408.04879v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 6 figures, and 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03631">arXiv:2408.03631</a> <span> [<a href="https://arxiv.org/pdf/2408.03631">pdf</a>, <a href="https://arxiv.org/ps/2408.03631">ps</a>, <a href="https://arxiv.org/format/2408.03631">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models for Base Station Siting: Intelligent Deployment based on Prompt or Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanhu Wang</a>, <a href="/search/cs?searchtype=author&query=Afzal%2C+M+M">Muhammad Muzammil Afzal</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chenyuan Feng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuaishuai Guo</a>, <a href="/search/cs?searchtype=author&query=Quek%2C+T+Q+S">Tony Q. S. Quek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03631v1-abstract-short" style="display: inline;"> Traditional base station siting (BSS) methods rely heavily on drive testing and user feedback, which are laborious and require extensive expertise in communication, networking, and optimization. As large language models (LLMs) and their associated technologies advance, particularly in the realms of prompt engineering and agent engineering, network optimization will witness a revolutionary approach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03631v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03631v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03631v1-abstract-full" style="display: none;"> Traditional base station siting (BSS) methods rely heavily on drive testing and user feedback, which are laborious and require extensive expertise in communication, networking, and optimization. As large language models (LLMs) and their associated technologies advance, particularly in the realms of prompt engineering and agent engineering, network optimization will witness a revolutionary approach. This approach entails the strategic use of well-crafted prompts to infuse human experience and knowledge into these sophisticated LLMs, and the deployment of autonomous agents as a communication bridge to seamlessly connect the machine language based LLMs with human users using natural language. This integration represents the future paradigm of artificial intelligence (AI) as a service and AI for more ease. As a preliminary exploration, this research first develops a novel LLM-empowered BSS optimization framework, and heuristically proposes four different potential implementations: the strategies based on Prompt-optimized LLM (PoL), human-in-the-Loop LLM (HiLL), LLM-empowered autonomous BSS agent (LaBa), and Cooperative multiple LLM-based autonomous BSS agents (CLaBa). Through evaluation on real-world data, the experiments demonstrate that prompt-assisted LLMs and LLM-based agents can generate more efficient, cost-effective, and reliable network deployments, noticeably enhancing the efficiency of BSS optimization and reducing trivial manual participation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03631v1-abstract-full').style.display = 'none'; document.getElementById('2408.03631v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Guo%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Guo%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository