Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 948 results for author: <span class="mathjax">Lu, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Lu%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Lu, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lu%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lu, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lu%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14131">arXiv:2411.14131</a> <span> [<a href="https://arxiv.org/pdf/2411.14131">pdf</a>, <a href="https://arxiv.org/format/2411.14131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> sEMG-based Gesture-Free Hand Intention Recognition: System, Dataset, Toolbox, and Benchmark Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongxin Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jingsheng Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuechao Xu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+W">Wei Dai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yaru Liu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Junhao Xiao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huimin Lu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zongtan Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14131v1-abstract-short" style="display: inline;"> In sensitive scenarios, such as meetings, negotiations, and team sports, messages must be conveyed without detection by non-collaborators. Previous methods, such as encrypting messages, eye contact, and micro-gestures, had problems with either inaccurate information transmission or leakage of interaction intentions. 
To this end, a novel gesture-free hand intention recognition scheme was proposed,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14131v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14131v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14131v1-abstract-full" style="display: none;"> In sensitive scenarios, such as meetings, negotiations, and team sports, messages must be conveyed without detection by non-collaborators. Previous methods, such as encrypting messages, eye contact, and micro-gestures, had problems with either inaccurate information transmission or leakage of interaction intentions. To this end, a novel gesture-free hand intention recognition scheme was proposed, that adopted surface electromyography(sEMG) and isometric contraction theory to recognize different hand intentions without any gesture. Specifically, this work includes four parts: (1) the experimental system, consisting of the upper computer software, self-conducted myoelectric watch, and sports platform, is built to get sEMG signals and simulate multiple usage scenarios; (2) the paradigm is designed to standard prompt and collect the gesture-free sEMG datasets. Eight-channel signals of ten subjects were recorded twice per subject at about 5-10 days intervals; (3) the toolbox integrates preprocessing methods (data segmentation, filter, normalization, etc.), commonly used sEMG signal decoding methods, and various plotting functions, to facilitate the research of the dataset; (4) the benchmark results of widely used methods are provided. The results involve single-day, cross-day, and cross-subject experiments of 6-class and 12-class gesture-free hand intention when subjects with different sports motions. To help future research, all data, hardware, software, and methods are open-sourced on the following website: click here. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14131v1-abstract-full').style.display = 'none'; document.getElementById('2411.14131v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
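The toolbox described in this entry integrates data segmentation, filtering, and normalization for eight-channel sEMG recordings. Below is a minimal sketch of what such a preprocessing pipeline could look like; the sampling rate, filter band, window size, and function names are illustrative assumptions, not the authors' released toolbox.

```python
import numpy as np
from scipy.signal import butter, filtfilt

# Hypothetical illustration of the preprocessing steps named in the abstract
# (segmentation, filtering, normalization); not the authors' toolbox.

def bandpass(signal, fs=1000, low=20.0, high=450.0, order=4):
    """Band-pass filter one sEMG recording of shape (samples, channels)."""
    b, a = butter(order, [low / (fs / 2), high / (fs / 2)], btype="band")
    return filtfilt(b, a, signal, axis=0)

def segment(signal, window=256, stride=128):
    """Slice a recording into overlapping windows of shape (window, channels)."""
    return np.stack([signal[s:s + window]
                     for s in range(0, len(signal) - window + 1, stride)])

def zscore(windows):
    """Per-channel z-score normalization within each window."""
    mean = windows.mean(axis=1, keepdims=True)
    std = windows.std(axis=1, keepdims=True) + 1e-8
    return (windows - mean) / std

# Example: 10 s of synthetic 8-channel data at 1 kHz.
raw = np.random.randn(10_000, 8)
features = zscore(segment(bandpass(raw)))
print(features.shape)  # (num_windows, 256, 8)
```
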
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13892">arXiv:2411.13892</a> <span> [<a href="https://arxiv.org/pdf/2411.13892">pdf</a>, <a href="https://arxiv.org/format/2411.13892">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Topology-Aware Popularity Debiasing via Simplicial Complexes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yanbiao Ji</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yue Ding</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yuxiang Lu</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+X">Xin Xin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongtao Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13892v1-abstract-short" style="display: inline;"> Recommender systems (RS) play a critical role in delivering personalized content across various online platforms, leveraging collaborative filtering (CF) as a key technique to generate recommendations based on users' historical interaction data. Recent advancements in CF have been driven by the adoption of Graph Neural Networks (GNNs), which model user-item interactions as bipartite graphs, enabli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13892v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13892v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13892v1-abstract-full" style="display: none;"> Recommender systems (RS) play a critical role in delivering personalized content across various online platforms, leveraging collaborative filtering (CF) as a key technique to generate recommendations based on users' historical interaction data. Recent advancements in CF have been driven by the adoption of Graph Neural Networks (GNNs), which model user-item interactions as bipartite graphs, enabling the capture of high-order collaborative signals. Despite their success, GNN-based methods face significant challenges due to the inherent popularity bias in the user-item interaction graph's topology, leading to skewed recommendations that favor popular items over less-known ones. To address this challenge, we propose a novel topology-aware popularity debiasing framework, Test-time Simplicial Propagation (TSP), which incorporates simplicial complexes (SCs) to enhance the expressiveness of GNNs. Unlike traditional methods that focus on pairwise relationships, our approach captures multi-order relationships through SCs, providing a more comprehensive representation of user-item interactions. By enriching the neighborhoods of tail items and leveraging SCs for feature smoothing, TSP enables the propagation of multi-order collaborative signals and effectively mitigates biased propagation. 
Our TSP module is designed as a plug-and-play solution, allowing for seamless integration into pre-trained GNN-based models without the need for fine-tuning additional parameters. Extensive experiments on five real-world datasets demonstrate the superior performance of our method, particularly in long-tail recommendation tasks. Visualization results further confirm that TSP produces more uniform distributions of item representations, leading to fairer and more accurate recommendations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13892v1-abstract-full').style.display = 'none'; document.getElementById('2411.13892v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11581">arXiv:2411.11581</a> <span> [<a href="https://arxiv.org/pdf/2411.11581">pdf</a>, <a href="https://arxiv.org/format/2411.11581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OASIS: Open Agents Social Interaction Simulations on One Million Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyi Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zaibin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zirui Zheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuxian Jiang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Ziyue Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zijian Ling</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinsong Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+M">Martz Ma</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+B">Bowen Dong</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+P">Prateek Gupta</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyue Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xu Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijun Wang</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11581v2-abstract-short" style="display: inline;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby 
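The entry above describes TSP as a plug-and-play module that smooths item representations over simplicial (multi-item) neighborhoods at test time, without retraining the base recommender. A rough sketch of that idea follows, assuming the simplicial complex is given as plain lists of item indices; the paper's actual construction and propagation rule may differ.

```python
import numpy as np

# Rough sketch of test-time smoothing of item embeddings over higher-order
# neighborhoods. Simplices are modeled here simply as index sets.

def smooth_item_embeddings(item_emb, simplices, alpha=0.5, steps=2):
    """Blend each item's embedding with the mean embedding of the simplices
    (multi-item groups) it belongs to.

    item_emb  : (num_items, dim) array from a pre-trained recommender.
    simplices : list of lists of item indices (e.g. co-interacted item sets).
    alpha     : how much of the smoothed signal to mix in.
    """
    emb = item_emb.copy()
    for _ in range(steps):
        agg = np.zeros_like(emb)
        count = np.zeros(len(emb))
        for simplex in simplices:
            mean = emb[simplex].mean(axis=0)
            for i in simplex:
                agg[i] += mean
                count[i] += 1
        mask = count > 0
        agg[mask] /= count[mask, None]
        emb[mask] = (1 - alpha) * emb[mask] + alpha * agg[mask]
    return emb

# Toy usage with two overlapping "simplices" over six items.
emb = np.random.randn(6, 4)
smoothed = smooth_item_embeddings(emb, [[0, 1, 2], [2, 3, 4]])
```
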
3. arXiv:2411.11581 [pdf, other] - cs.CL (Computation and Language)
   https://arxiv.org/abs/2411.11581
   OASIS: Open Agents Social Interaction Simulations on One Million Agents
   Authors: Ziyi Yang, Zaibin Zhang, Zirui Zheng, Yuxian Jiang, Ziyue Gan, Zhiyu Wang, Zijian Ling, Jinsong Chen, Martz Ma, Bowen Dong, Prateek Gupta, Shuyue Hu, Zhenfei Yin, Guohao Li, Xu Jia, Lijun Wang, Bernard Ghanem, Huchuan Lu, Wanli Ouyang, Yu Qiao, Philip Torr, Jing Shao
   Abstract: There has been growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (e.g., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a particular scenario, making it time-consuming and resource-intensive to explore other phenomena using the same ABM. Additionally, these models simulate only a limited number of agents, whereas real-world social media platforms involve millions of users. To this end, we propose OASIS, a generalizable and scalable social media simulator. OASIS is designed based on real-world social media platforms, incorporating dynamically updated environments (i.e., dynamic social networks and post information), diverse action spaces (e.g., following, commenting), and recommendation systems (i.e., interest-based and hot-score-based). Additionally, OASIS supports large-scale user simulations, capable of modeling up to one million users. With these features, OASIS can easily be extended to different social media platforms to study large-scale group phenomena and behaviors. We replicate various social phenomena, including information spreading, group polarization, and herd effects across the X and Reddit platforms. Moreover, we provide observations of social phenomena at different agent group scales. We observe that larger agent group scales lead to more pronounced group dynamics and more diverse and helpful agent opinions. These findings demonstrate OASIS's potential as a powerful tool for studying complex systems in digital environments.
   Submitted 21 November, 2024; v1 submitted 18 November, 2024; originally announced November 2024.

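As a rough illustration of the ingredients the OASIS abstract lists (a dynamically updated environment, a small action space, and a hot-score recommendation feed), here is a toy agent loop. It is not the OASIS codebase or its API; the random policy simply stands in for the LLM agents.

```python
import random

# Illustrative skeleton only: shared post store, follower graph, and a
# hot-score feed, with a placeholder policy instead of LLM agents.

posts = []          # each post: {"author": int, "text": str, "likes": int, "t": int}
followers = {}      # author id -> set of follower ids

def hot_score(post, now):
    return post["likes"] / (1 + now - post["t"])

def recommend(now, k=3):
    return sorted(posts, key=lambda p: hot_score(p, now), reverse=True)[:k]

def agent_step(agent_id, now):
    feed = recommend(now)
    action = random.choice(["post", "like", "follow"])  # stand-in for an LLM policy
    if action == "post":
        posts.append({"author": agent_id, "text": f"msg from {agent_id}", "likes": 0, "t": now})
    elif action == "like" and feed:
        random.choice(feed)["likes"] += 1
    elif action == "follow" and feed:
        followers.setdefault(random.choice(feed)["author"], set()).add(agent_id)

for t in range(10):              # simulation steps
    for agent in range(100):     # toy population; OASIS itself scales to ~1M agents
        agent_step(agent, t)
```
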
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11435">arXiv:2411.11435</a> <span> [<a href="https://arxiv.org/pdf/2411.11435">pdf</a>, <a href="https://arxiv.org/format/2411.11435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GLDesigner: Leveraging Multi-Modal LLMs as Designer for Enhanced Aesthetic Text Glyph Layouts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Junwen He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijun Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jun-Yan He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenyang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jin-Peng Lan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bin Luo</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Y">Yifeng Geng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11435v1-abstract-short" style="display: inline;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11435v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11435v1-abstract-full" style="display: none;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper, we propose a VLM-based framework that generates content-aware text logo layouts by integrating multi-modal inputs with user constraints, supporting a more flexible and stable layout design in real-world applications. We introduce two model techniques to reduce the computation for processing multiple glyph images simultaneously, while does not face performance degradation. To support instruction-tuning of out model, we construct two extensive text logo datasets, which are 5x more larger than the existing public dataset. Except for the geometric annotations (e.g. text masks and character recognition), we also compliment with comprehensive layout descriptions in natural language format, for more effective training to have reasoning ability when dealing with complex layouts and custom user constraints. 
Experimental studies demonstrate the effectiveness of our proposed model and datasets, when comparing with previous methods in various benchmarks to evaluate geometric aesthetics and human preferences. The code and datasets will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'none'; document.getElementById('2411.11435v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08516">arXiv:2411.08516</a> <span> [<a href="https://arxiv.org/pdf/2411.08516">pdf</a>, <a href="https://arxiv.org/format/2411.08516">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Tree-of-Table: Unleashing the Power of LLMs for Enhanced Large-Scale Table Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+D">Deyi Ji</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lanyun Zhu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Siqi Gao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+P">Peng Xu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongtao Lu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+F">Feng Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08516v1-abstract-short" style="display: inline;"> The ubiquity and value of tables as semi-structured data across various domains necessitate advanced methods for understanding their complexity and vast amounts of information. Despite the impressive capabilities of large language models (LLMs) in advancing the natural language understanding frontier, their application to large-scale tabular data presents significant challenges, specifically regar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08516v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08516v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08516v1-abstract-full" style="display: none;"> The ubiquity and value of tables as semi-structured data across various domains necessitate advanced methods for understanding their complexity and vast amounts of information. Despite the impressive capabilities of large language models (LLMs) in advancing the natural language understanding frontier, their application to large-scale tabular data presents significant challenges, specifically regarding table size and complex intricate relationships. Existing works have shown promise with small-scale tables but often flounder when tasked with the complex reasoning required by larger, interconnected tables found in real-world scenarios. 
5. arXiv:2411.08516 [pdf, other] - cs.CL (Computation and Language)
   https://arxiv.org/abs/2411.08516
   Tree-of-Table: Unleashing the Power of LLMs for Enhanced Large-Scale Table Understanding
   Authors: Deyi Ji, Lanyun Zhu, Siqi Gao, Peng Xu, Hongtao Lu, Jieping Ye, Feng Zhao
   Abstract: The ubiquity and value of tables as semi-structured data across various domains necessitate advanced methods for understanding their complexity and vast amounts of information. Despite the impressive capabilities of large language models (LLMs) in advancing the natural language understanding frontier, their application to large-scale tabular data presents significant challenges, specifically regarding table size and complex, intricate relationships. Existing works have shown promise with small-scale tables but often flounder when tasked with the complex reasoning required by larger, interconnected tables found in real-world scenarios. To address this gap, we introduce Tree-of-Table, a novel approach designed to enhance LLMs' reasoning capabilities over large and complex tables. Our method employs Table Condensation and Decomposition to distill and reorganize relevant data into a manageable format, followed by the construction of a hierarchical Table-Tree that facilitates tree-structured reasoning. Through a meticulous Table-Tree Execution process, we systematically unravel the tree-structured reasoning chain to derive the solution. Experiments across diverse datasets, including WikiTQ, TableFact, FeTaQA, and BIRD, demonstrate that Tree-of-Table sets a new benchmark with superior performance, showcasing remarkable efficiency and generalization capabilities in large-scale table reasoning.
   Submitted 13 November, 2024; originally announced November 2024.

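The abstract sketches a three-stage flow: condense the table, decompose it into sub-tables, then execute reasoning over the resulting tree. Below is a hedged outline of that orchestration, with ask_llm as a placeholder for any LLM client; the paper's prompts, tree construction, and execution procedure are more elaborate.

```python
# Hedged sketch of a condense -> decompose -> tree-execute flow. `ask_llm`
# is a stand-in stub, not any particular provider's API.

def ask_llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

def tree_of_table(table: str, question: str) -> str:
    # 1. Condensation: keep only rows/columns plausibly needed for the question.
    condensed = ask_llm(f"Keep only the parts of this table relevant to '{question}':\n{table}")

    # 2. Decomposition: split the condensed table into smaller sub-tables,
    #    one per sub-question (naively delimited with '---' in this sketch).
    sub_tables = ask_llm(
        f"Split into '---'-separated sub-tables for '{question}':\n{condensed}"
    ).split("---")

    # 3. Tree execution: answer each leaf, then combine the answers at the root.
    leaf_answers = [ask_llm(f"Answer '{question}' using only:\n{t}") for t in sub_tables]
    return ask_llm(f"Combine these partial answers to '{question}': {leaf_answers}")
```
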
6. arXiv:2411.04844 [pdf, other] - eess.IV (Image and Video Processing), cs.CV (Computer Vision and Pattern Recognition)
   https://arxiv.org/abs/2411.04844
   Differentiable Gaussian Representation for Incomplete CT Reconstruction
   Authors: Shaokai Wu, Yuxiang Lu, Wei Ji, Suizhi Huang, Fengyu Yang, Shalayiding Sirejiding, Qichen He, Jing Tong, Yanbiao Ji, Yue Ding, Hongtao Lu
   Abstract: Incomplete Computed Tomography (CT) benefits patients by reducing radiation exposure. However, reconstructing high-fidelity images from limited views or angles remains challenging due to the ill-posed nature of the problem. Deep Learning Reconstruction (DLR) methods have shown promise in enhancing image quality, but the paradox between training-data diversity and high generalization ability remains unsolved. In this paper, we propose a novel Gaussian Representation for Incomplete CT Reconstruction (GRCT) that uses neither neural networks nor full-dose CT data. Specifically, we model the 3D volume as a set of learnable Gaussians, which are optimized directly from the incomplete sinogram. Our method can be applied to multiple views and angles without changing the architecture. Additionally, we propose a differentiable Fast CT Reconstruction method for efficient clinical usage. Extensive experiments on multiple datasets and settings demonstrate significant improvements in reconstruction-quality metrics and high efficiency. We plan to release our code as open source.
   Submitted 7 November, 2024; originally announced November 2024.

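GRCT's core idea, as the abstract states it, is to optimize a set of learnable Gaussians directly against the incomplete sinogram, with no neural network. The sketch below fits isotropic 2-D Gaussians to a sparse parallel-beam sinogram using their analytic line projections; the geometry, parameterization, and loss are simplifying assumptions rather than the paper's method.

```python
import torch
import torch.nn.functional as F

# Illustrative sketch: fit isotropic 2-D Gaussians to a 6-view sinogram.
n_gauss, n_det = 256, 128
angles = torch.deg2rad(torch.arange(0.0, 180.0, 30.0))   # only 6 views: incomplete CT
det = torch.linspace(-1.0, 1.0, n_det)

centers = torch.nn.Parameter(torch.rand(n_gauss, 2) * 2 - 1)    # (x, y) in [-1, 1]
log_sigma = torch.nn.Parameter(torch.full((n_gauss,), -2.0))
raw_amp = torch.nn.Parameter(torch.zeros(n_gauss))

def forward_project(theta):
    """Analytic projection of each isotropic Gaussian onto the detector line."""
    t = centers[:, 0] * torch.cos(theta) + centers[:, 1] * torch.sin(theta)
    sigma = log_sigma.exp()
    profile = torch.exp(-(det[None, :] - t[:, None]) ** 2 / (2 * sigma[:, None] ** 2))
    return (F.softplus(raw_amp)[:, None] * profile).sum(dim=0)   # (n_det,)

measured = torch.rand(len(angles), n_det)     # stand-in for the real sinogram
opt = torch.optim.Adam([centers, log_sigma, raw_amp], lr=1e-2)
for _ in range(200):
    opt.zero_grad()
    pred = torch.stack([forward_project(th) for th in angles])
    loss = F.mse_loss(pred, measured)
    loss.backward()
    opt.step()
```
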
7. arXiv:2411.01200 [pdf, other] - cs.RO (Robotics), cs.AI, cs.CV, cs.HC
   https://arxiv.org/abs/2411.01200
   GarmentLab: A Unified Simulation and Benchmark for Garment Manipulation
   Authors: Haoran Lu, Ruihai Wu, Yitong Li, Sijie Li, Ziyu Zhu, Chuanruo Ning, Yan Shen, Longzan Luo, Yuanpei Chen, Hao Dong
   Abstract: Manipulating garments and fabrics has long been a critical endeavor in the development of home-assistant robots. However, due to complex dynamics and topological structures, garment manipulation poses significant challenges. Recent successes in reinforcement learning and vision-based methods offer promising avenues for learning garment manipulation. Nevertheless, these approaches are severely constrained by current benchmarks, which offer limited task diversity and unrealistic simulation behavior. Therefore, we present GarmentLab, a content-rich benchmark and realistic simulation designed for deformable-object and garment manipulation. Our benchmark encompasses a diverse range of garment types, robotic systems, and manipulators. The abundant tasks in the benchmark further explore the interactions between garments, deformable objects, rigid bodies, fluids, and the human body. Moreover, by incorporating multiple simulation methods such as FEM and PBD, along with our proposed sim-to-real algorithms and a real-world benchmark, we aim to significantly narrow the sim-to-real gap. We evaluate state-of-the-art vision methods, reinforcement learning, and imitation learning approaches on these tasks, highlighting the challenges faced by current algorithms, notably their limited generalization capabilities. Our proposed open-source environments and comprehensive analysis promise to boost future research in garment manipulation by unlocking the full potential of these methods. We will open-source our code as soon as possible. Videos in the supplementary files provide further details of our work. Our project page is available at: https://garmentlab.github.io/
   Submitted 2 November, 2024; originally announced November 2024.
   Comments: NeurIPS 2024

8. arXiv:2411.01141 [pdf, other] - cs.CL (Computation and Language)
   https://arxiv.org/abs/2411.01141
   Dictionary Insertion Prompting for Multilingual Reasoning on Multilingual Large Language Models
   Authors: Hongyuan Lu, Zixuan Li, Wai Lam
   Abstract: As current training data for Large Language Models (LLMs) is dominated by English corpora, LLMs are English-centric and present impressive performance on English reasoning tasks. (This paper primarily studies English-centric models, but our method could be made universal by using the centric language in the dictionary for non-English-centric LLMs.) Yet, they usually suffer from lower performance in other languages. There are about 7,000 languages around the world, and many are low-resource for English-centric LLMs. For the sake of people who primarily speak these languages, it is especially urgent to enable our LLMs in those languages. Model training is usually effective but computationally expensive and requires experienced NLP practitioners. This paper presents a novel, simple yet effective method called Dictionary Insertion Prompting (DIP). When given a non-English prompt, DIP looks up a word dictionary and inserts the words' English counterparts into the prompt for the LLM. This enables better translation into English and better English model thinking steps, which leads to clearly better results. We experiment with about 200 languages from FLORES-200. Since there are no adequate datasets, we use the NLLB translator to create synthetic multilingual benchmarks from four existing English reasoning benchmarks such as GSM8K and AQuA. Despite its simplicity and computational lightness, we find DIP surprisingly effective on math and commonsense reasoning tasks on multiple open-source and closed-source LLMs. (Our dictionaries, code, and synthetic benchmarks will be open-sourced to facilitate future research.)
   Submitted 2 November, 2024; originally announced November 2024.

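DIP's mechanism as described is concrete enough to sketch: look up each word of a non-English prompt in a bilingual dictionary and splice the English counterparts into the prompt before querying the LLM. The annotation format, toy dictionary, and ask_llm stub below are illustrative placeholders, not the authors' exact setup.

```python
# Minimal sketch of dictionary insertion before prompting an LLM.

def ask_llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

def dictionary_insertion_prompt(question: str, bilingual_dict: dict) -> str:
    """Annotate each known word with its English counterpart in parentheses."""
    annotated = " ".join(
        f"{w} ({bilingual_dict[w]})" if w in bilingual_dict else w
        for w in question.split()
    )
    return (
        "Translate the question to English, think step by step in English, "
        f"then answer.\nQuestion: {annotated}"
    )

# Toy usage with a tiny German-English dictionary (illustrative only).
prompt = dictionary_insertion_prompt(
    "Wie viele Äpfel bleiben übrig?",
    {"viele": "many", "Äpfel": "apples", "bleiben": "remain", "übrig": "left over"},
)
print(prompt)
# answer = ask_llm(prompt)
```
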
9. arXiv:2410.21287 [pdf, other] - cs.CY (Computers and Society), cs.AI
   https://arxiv.org/abs/2410.21287
   A Systematic Assessment of OpenAI o1-Preview for Higher Order Thinking in Education
   Authors: Ehsan Latif, Yifan Zhou, Shuchen Guo, Yizhu Gao, Lehong Shi, Matthew Nayaaba, Gyeonggeon Lee, Liang Zhang, Arne Bewersdorff, Luyang Fang, Xiantong Yang, Huaqin Zhao, Hanqi Jiang, Haoran Lu, Jiaxi Li, Jichao Yu, Weihang You, Zhengliang Liu, Vincent Shung Liu, Hui Wang, Zihao Wu, Jin Lu, Fei Dou, Ping Ma, Ninghao Liu, et al. (2 additional authors not shown)
   Abstract: As artificial intelligence (AI) continues to advance, it demonstrates capabilities comparable to human intelligence, with significant potential to transform education and workforce development. This study evaluates OpenAI o1-preview's ability to perform higher-order cognitive tasks across 14 dimensions, including critical thinking, systems thinking, computational thinking, design thinking, metacognition, data literacy, creative thinking, abstract reasoning, quantitative reasoning, logical reasoning, analogical reasoning, and scientific reasoning. We used validated instruments like the Ennis-Weir Critical Thinking Essay Test and the Biological Systems Thinking Test to compare the o1-preview's performance with human performance systematically. Our findings reveal that o1-preview outperforms humans in most categories, achieving 150% better results in systems thinking, computational thinking, data literacy, creative thinking, scientific reasoning, and abstract reasoning. However, compared to humans, it underperforms by around 25% in logical reasoning, critical thinking, and quantitative reasoning. In analogical reasoning, both o1-preview and humans achieved perfect scores. Despite these strengths, the o1-preview shows limitations in abstract reasoning, where human psychology students outperform it, highlighting the continued importance of human oversight in tasks requiring high-level abstraction. These results have significant educational implications, suggesting a shift toward developing human skills that complement AI, such as creativity, abstract reasoning, and critical thinking. This study emphasizes the transformative potential of AI in education and calls for a recalibration of educational goals, teaching methods, and curricula to align with an AI-driven world.
   Submitted 11 October, 2024; originally announced October 2024.

</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An assessment of OpenAI o1-Preview for Higher Order Thinking in Education</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21142">arXiv:2410.21142</a> <span> [<a href="https://arxiv.org/pdf/2410.21142">pdf</a>, <a href="https://arxiv.org/format/2410.21142">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Modeling and Monitoring of Indoor Populations using Sparse Positioning Data (Extension) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huan Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hua Lu</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+C+S">Christian S. Jensen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21142v2-abstract-short" style="display: inline;"> In large venues like shopping malls and airports, knowledge on the indoor populations fuels applications such as business analytics, venue management, and safety control. In this work, we provide means of modeling populations in partitions of indoor space offline and of monitoring indoor populations continuously, by using indoor positioning data. However, the low-sampling rates of indoor positioni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21142v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21142v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21142v2-abstract-full" style="display: none;"> In large venues like shopping malls and airports, knowledge on the indoor populations fuels applications such as business analytics, venue management, and safety control. In this work, we provide means of modeling populations in partitions of indoor space offline and of monitoring indoor populations continuously, by using indoor positioning data. However, the low-sampling rates of indoor positioning render the data temporally and spatially sparse, which in turn renders the offline capture of indoor populations challenging. It is even more challenging to continuously monitor indoor populations, as positioning data may be missing or not ready yet at the current moment. To address these challenges, we first enable probabilistic modeling of populations in indoor space partitions as Normal distributions. Based on that, we propose two learning-based estimators for on-the-fly prediction of population distributions. Leveraging the prediction-based schemes, we provide a unified continuous query processing framework for a type of query that enables continuous monitoring of populated partitions. The framework encompasses caching and result validity mechanisms to reduce cost and maintain monitoring effectiveness. Extensive experiments on two real data sets show that the proposed estimators are able to outperform the state-of-the-art alternatives and that the query processing framework is effective and efficient. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21142v2-abstract-full').style.display = 'none'; document.getElementById('2410.21142v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at TKDE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20927">arXiv:2410.20927</a> <span> [<a href="https://arxiv.org/pdf/2410.20927">pdf</a>, <a href="https://arxiv.org/format/2410.20927">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VLMimic: Vision Language Models are Visual Imitation Learner for Fine-grained Actions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanyan Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meiling Wang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+T">Te Cui</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+Y">Yao Mu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haoyang Lu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianxing Zhou</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Z">Zicai Peng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Mengxiao Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haizhou Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yi Yang</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Y">Yufeng Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20927v3-abstract-short" style="display: inline;"> Visual imitation learning (VIL) provides an efficient and intuitive strategy for robotic systems to acquire novel skills. Recent advancements in Vision Language Models (VLMs) have demonstrated remarkable performance in vision and language reasoning capabilities for VIL tasks. Despite the progress, current VIL methods naively employ VLMs to learn high-level plans from human videos, relying on pre-d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20927v3-abstract-full').style.display = 'inline'; document.getElementById('2410.20927v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20927v3-abstract-full" style="display: none;"> Visual imitation learning (VIL) provides an efficient and intuitive strategy for robotic systems to acquire novel skills. Recent advancements in Vision Language Models (VLMs) have demonstrated remarkable performance in vision and language reasoning capabilities for VIL tasks. 
Despite the progress, current VIL methods naively employ VLMs to learn high-level plans from human videos, relying on pre-defined motion primitives for executing physical interactions, which remains a major bottleneck. In this work, we present VLMimic, a novel paradigm that harnesses VLMs to directly learn even fine-grained action levels, only given a limited number of human videos. Specifically, VLMimic first grounds object-centric movements from human videos, and learns skills using hierarchical constraint representations, facilitating the derivation of skills with fine-grained action levels from limited human videos. These skills are refined and updated through an iterative comparison strategy, enabling efficient adaptation to unseen environments. Our extensive experiments exhibit that our VLMimic, using only 5 human videos, yields significant improvements of over 27% and 21% in RLBench and real-world manipulation tasks, and surpasses baselines by over 37% in long-horizon tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20927v3-abstract-full').style.display = 'none'; document.getElementById('2410.20927v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted for publication in the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20178">arXiv:2410.20178</a> <span> [<a href="https://arxiv.org/pdf/2410.20178">pdf</a>, <a href="https://arxiv.org/format/2410.20178">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LLMs Can Evolve Continually on Modality for X-Modal Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiazuo Yu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Haomiao Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lu Zhang</a>, <a href="/search/cs?searchtype=author&query=Diao%2C+H">Haiwen Diao</a>, <a href="/search/cs?searchtype=author&query=Zhuge%2C+Y">Yunzhi Zhuge</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dong Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">You He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20178v2-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have gained significant attention due to their impressive capabilities in multimodal understanding. However, existing methods rely heavily on extensive modal-specific pretraining and joint-modal tuning, leading to significant computational burdens when expanding to new modalities. In this paper, we propose PathWeave, a flexible and scalable framework with m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20178v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20178v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20178v2-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have gained significant attention due to their impressive capabilities in multimodal understanding. However, existing methods rely heavily on extensive modal-specific pretraining and joint-modal tuning, leading to significant computational burdens when expanding to new modalities. In this paper, we propose PathWeave, a flexible and scalable framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We leverage the concept of Continual Learning and develop an incremental training strategy atop pre-trained MLLMs, enabling their expansion to new modalities using uni-modal data, without executing joint-modal pretraining. In detail, a novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and cross-modal adapters are seamlessly integrated to facilitate efficient modality alignment and collaboration. Additionally, an MoE-based gating module is applied between two types of adapters to further enhance the multimodal interaction. To investigate the proposed method, we establish a challenging benchmark called Continual Learning of Modality (MCL), which consists of high-quality QA data from five distinct modalities: image, video, audio, depth and point cloud. Extensive experiments demonstrate the effectiveness of the proposed AnA framework on learning plasticity and memory stability during continual learning. Furthermore, PathWeave performs comparably to state-of-the-art MLLMs while concurrently reducing parameter training burdens by 98.73%. Our code locates at https://github.com/JiazuoYu/PathWeave <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20178v2-abstract-full').style.display = 'none'; document.getElementById('2410.20178v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17233">arXiv:2410.17233</a> <span> [<a href="https://arxiv.org/pdf/2410.17233">pdf</a>, <a href="https://arxiv.org/format/2410.17233">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Few-shot In-Context Preference Learning Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hong Lu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jiaxuan Gao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Q">Qixin Tan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinting Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yi Wu</a>, <a href="/search/cs?searchtype=author&query=Vinitsky%2C+E">Eugene Vinitsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17233v1-abstract-short" style="display: inline;"> Designing reward functions is a core component of reinforcement learning but can be challenging for truly complex behavior. Reinforcement Learning from Human Feedback (RLHF) has been used to alleviate this challenge by replacing a hand-coded reward function with a reward function learned from preferences. However, it can be exceedingly inefficient to learn these rewards as they are often learned t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17233v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17233v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17233v1-abstract-full" style="display: none;"> Designing reward functions is a core component of reinforcement learning but can be challenging for truly complex behavior. Reinforcement Learning from Human Feedback (RLHF) has been used to alleviate this challenge by replacing a hand-coded reward function with a reward function learned from preferences. However, it can be exceedingly inefficient to learn these rewards as they are often learned tabula rasa. We investigate whether Large Language Models (LLMs) can reduce this query inefficiency by converting an iterative series of human preferences into code representing the rewards. We propose In-Context Preference Learning (ICPL), a method that uses the grounding of an LLM to accelerate learning reward functions from preferences. ICPL takes the environment context and task description, synthesizes a set of reward functions, and then repeatedly updates the reward functions using human rankings of videos of the resultant policies. Using synthetic preferences, we demonstrate that ICPL is orders of magnitude more efficient than RLHF and is even competitive with methods that use ground-truth reward functions instead of preferences. 
Finally, we perform a series of human preference-learning trials and observe that ICPL extends beyond synthetic settings and can work effectively with humans-in-the-loop. Additional information and videos are provided at https://sites.google.com/view/few-shot-icpl/home. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17233v1-abstract-full').style.display = 'none'; document.getElementById('2410.17233v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16166">arXiv:2410.16166</a> <span> [<a href="https://arxiv.org/pdf/2410.16166">pdf</a>, <a href="https://arxiv.org/format/2410.16166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Beyond Filtering: Adaptive Image-Text Quality Enhancement for MLLM Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Han Huang</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zijia Zhao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haoyu Lu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shu Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingning Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weipeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16166v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have made significant strides by integrating visual and textual modalities. A critical factor in training MLLMs is the quality of image-text pairs within multimodal pretraining datasets. However, $\textit {de facto}$ filter-based data quality enhancement paradigms often discard a substantial portion of high-quality image data due to inadequate semantic alig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16166v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16166v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have made significant strides by integrating visual and textual modalities. A critical factor in training MLLMs is the quality of image-text pairs within multimodal pretraining datasets. 
However, $\textit {de facto}$ filter-based data quality enhancement paradigms often discard a substantial portion of high-quality image data due to inadequate semantic alignment between images and texts, leading to inefficiencies in data utilization and scalability. In this paper, we propose the Adaptive Image-Text Quality Enhancer (AITQE), a model that dynamically assesses and enhances the quality of image-text pairs. AITQE employs a text rewriting mechanism for low-quality pairs and incorporates a negative sample learning strategy to improve evaluative capabilities by integrating deliberately selected low-quality samples during training. Unlike prior approaches that significantly alter text distributions, our method minimally adjusts text to preserve data volume while enhancing quality. Experimental results demonstrate that AITQE surpasses existing methods on various benchmark, effectively leveraging raw data and scaling efficiently with increasing data volumes. We hope our work will inspire future works. The code and model are available at: https://github.com/hanhuang22/AITQE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16166v1-abstract-full').style.display = 'none'; document.getElementById('2410.16166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15526">arXiv:2410.15526</a> <span> [<a href="https://arxiv.org/pdf/2410.15526">pdf</a>, <a href="https://arxiv.org/format/2410.15526">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> SDP4Bit: Toward 4-bit Communication Quantization in Sharded Data Parallelism for LLM Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+J">Jinda Jia</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Cong Xie</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hanlin Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Daoce Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Hao Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengming Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+B">Baixi Sun</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haibin Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dingwen Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15526v1-abstract-short" style="display: inline;"> Recent years have witnessed a clear trend towards language models with an ever-increasing number of parameters, as well as the growing training overhead and memory usage. 
Distributed training, particularly through Sharded Data Parallelism (ShardedDP) which partitions optimizer states among workers, has emerged as a crucial technique to mitigate training time and memory usage. Yet, a major challeng… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15526v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15526v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15526v1-abstract-full" style="display: none;"> Recent years have witnessed a clear trend towards language models with an ever-increasing number of parameters, as well as the growing training overhead and memory usage. Distributed training, particularly through Sharded Data Parallelism (ShardedDP) which partitions optimizer states among workers, has emerged as a crucial technique to mitigate training time and memory usage. Yet, a major challenge in the scalability of ShardedDP is the intensive communication of weights and gradients. While compression techniques can alleviate this issue, they often result in worse accuracy. Driven by this limitation, we propose SDP4Bit (Toward 4Bit Communication Quantization in Sharded Data Parallelism for LLM Training), which effectively reduces the communication of weights and gradients to nearly 4 bits via two novel techniques: quantization on weight differences, and two-level gradient smooth quantization. Furthermore, SDP4Bit presents an algorithm-system co-design with runtime optimization to minimize the computation overhead of compression. In addition to the theoretical guarantees of convergence, we empirically evaluate the accuracy of SDP4Bit on the pre-training of GPT models with up to 6.7 billion parameters, and the results demonstrate a negligible impact on training loss. Furthermore, speed experiments show that SDP4Bit achieves up to 4.08$\times$ speedup in end-to-end throughput on a scale of 128 GPUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15526v1-abstract-full').style.display = 'none'; document.getElementById('2410.15526v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
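</p> <p class="is-size-7">The core communication idea described above, quantizing weight <em>differences</em> rather than full weights, can be sketched as follows. The helper names, the single per-tensor scale, and the omission of 4-bit packing are assumptions made for illustration; this is not the SDP4Bit implementation.</p> <pre class="is-size-7"><code>
# Sketch: send a 4-bit quantized weight difference instead of full-precision weights.
import numpy as np

def quantize_diff_4bit(w_new, w_ref):
    """Map (w_new - w_ref) to signed 4-bit levels in [-8, 7] with one scale per tensor."""
    diff = w_new - w_ref
    scale = np.abs(diff).max() / 7.0 + 1e-12
    q = np.clip(np.round(diff / scale), -8, 7).astype(np.int8)   # packing 2 values/byte omitted
    return q, scale                                              # payload a worker would send

def dequantize_diff_4bit(q, scale, w_ref):
    """Receiver reconstructs an approximate weight tensor from its reference copy."""
    return w_ref + q.astype(np.float32) * scale

rng = np.random.default_rng(0)
w_ref = rng.normal(size=1000).astype(np.float32)                 # last synchronized weights
w_new = w_ref + 0.01 * rng.normal(size=1000).astype(np.float32)  # after a local update
q, s = quantize_diff_4bit(w_new, w_ref)
w_hat = dequantize_diff_4bit(q, s, w_ref)
print("max reconstruction error:", float(np.abs(w_hat - w_new).max()))
</code></pre> <p class="is-size-7">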
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15266">arXiv:2410.15266</a> <span> [<a href="https://arxiv.org/pdf/2410.15266">pdf</a>, <a href="https://arxiv.org/format/2410.15266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> GSSF: Generalized Structural Sparse Function for Deep Cross-modal Metric Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Diao%2C+H">Haiwen Diao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shang Gao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiawen Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15266v1-abstract-short" style="display: inline;"> Cross-modal metric learning is a prominent research topic that bridges the semantic heterogeneity between vision and language. Existing methods frequently utilize simple cosine or complex distance metrics to transform the pairwise features into a similarity score, which suffers from an inadequate or inefficient capability for distance measurements. Consequently, we propose a Generalized Structural… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15266v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15266v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15266v1-abstract-full" style="display: none;"> Cross-modal metric learning is a prominent research topic that bridges the semantic heterogeneity between vision and language. Existing methods frequently utilize simple cosine or complex distance metrics to transform the pairwise features into a similarity score, which suffers from an inadequate or inefficient capability for distance measurements. Consequently, we propose a Generalized Structural Sparse Function to dynamically capture thorough and powerful relationships across modalities for pair-wise similarity learning while remaining concise but efficient. Specifically, the distance metric delicately encapsulates two formats of diagonal and block-diagonal terms, automatically distinguishing and highlighting the cross-channel relevancy and dependency inside a structured and organized topology. Hence, it thereby empowers itself to adapt to the optimal matching patterns between the paired features and reaches a sweet spot between model complexity and capability. Extensive experiments on cross-modal and two extra uni-modal retrieval tasks (image-text retrieval, person re-identification, fine-grained image retrieval) have validated its superiority and flexibility over various popular retrieval frameworks. 
More importantly, we further discover that it can be seamlessly incorporated into multiple application scenarios, and demonstrates promising prospects from Attention Mechanism to Knowledge Distillation in a plug-and-play manner. Our code is publicly available at: https://github.com/Paranioar/GSSF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15266v1-abstract-full').style.display = 'none'; document.getElementById('2410.15266v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 9 figures, Accepted by TIP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15042">arXiv:2410.15042</a> <span> [<a href="https://arxiv.org/pdf/2410.15042">pdf</a>, <a href="https://arxiv.org/format/2410.15042">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Training: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mengnan Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lihe Zhang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jingwen Ye</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Baocai Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinchao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15042v1-abstract-short" style="display: inline;"> Adversarial training (AT) refers to integrating adversarial examples -- inputs altered with imperceptible perturbations that can significantly impact model predictions -- into the training process. Recent studies have demonstrated the effectiveness of AT in improving the robustness of deep neural networks against diverse adversarial attacks. However, a comprehensive overview of these developments… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15042v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15042v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15042v1-abstract-full" style="display: none;"> Adversarial training (AT) refers to integrating adversarial examples -- inputs altered with imperceptible perturbations that can significantly impact model predictions -- into the training process. Recent studies have demonstrated the effectiveness of AT in improving the robustness of deep neural networks against diverse adversarial attacks. However, a comprehensive overview of these developments is still missing. 
This survey addresses this gap by reviewing a broad range of recent and representative studies. Specifically, we first describe the implementation procedures and practical applications of AT, followed by a comprehensive review of AT techniques from three perspectives: data enhancement, network design, and training configurations. Lastly, we discuss common challenges in AT and propose several promising directions for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15042v1-abstract-full').style.display = 'none'; document.getElementById('2410.15042v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13694">arXiv:2410.13694</a> <span> [<a href="https://arxiv.org/pdf/2410.13694">pdf</a>, <a href="https://arxiv.org/format/2410.13694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Design Space of Visual Context Representation in Video MLLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yifan Du</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kun Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zijia Zhao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haoyu Lu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Han Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W+X">Wayne Xin Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingning Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weipeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13694v1-abstract-short" style="display: inline;"> Video Multimodal Large Language Models (MLLMs) have shown remarkable capability of understanding the video semantics on various downstream tasks. Despite the advancements, there is still a lack of systematic research on visual context representation, which refers to the scheme to select frames from a video and further select the tokens from a frame. In this paper, we explore the design space for v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13694v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13694v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13694v1-abstract-full" style="display: none;"> Video Multimodal Large Language Models (MLLMs) have shown remarkable capability of understanding the video semantics on various downstream tasks. 
Despite the advancements, there is still a lack of systematic research on visual context representation, which refers to the scheme to select frames from a video and further select the tokens from a frame. In this paper, we explore the design space for visual context representation, and aim to improve the performance of video MLLMs by finding more effective representation schemes. Firstly, we formulate the task of visual context representation as a constrained optimization problem, and model the language modeling loss as a function of the number of frames and the number of embeddings (or tokens) per frame, given the maximum visual context window size. Then, we explore the scaling effects in frame selection and token selection respectively, and fit the corresponding function curve by conducting extensive empirical experiments. We examine the effectiveness of typical selection strategies and present empirical findings to determine the two factors. Furthermore, we study the joint effect of frame selection and token selection, and derive the optimal formula for determining the two factors. We demonstrate that the derived optimal settings show alignment with the best-performed results of empirical experiments. Our code and model are available at: https://github.com/RUCAIBox/Opt-Visor. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13694v1-abstract-full').style.display = 'none'; document.getElementById('2410.13694v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Long Video MLLM; work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13573">arXiv:2410.13573</a> <span> [<a href="https://arxiv.org/pdf/2410.13573">pdf</a>, <a href="https://arxiv.org/format/2410.13573">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SPF-EMPC Planner: A real-time multi-robot trajectory planner for complex environments with uncertainties </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Pengming Zhu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhiwen Zeng</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xuekai Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huimin Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13573v1-abstract-short" style="display: inline;"> In practical applications, the unpredictable movement of obstacles and the imprecise state observation of robots introduce significant uncertainties for the swarm of robots, especially in cluster environments. 
However, existing methods struggle to realize safe navigation when considering uncertainties, complex environmental structures, and robot swarms. This paper introduces an extended state mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13573v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13573v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13573v1-abstract-full" style="display: none;"> In practical applications, the unpredictable movement of obstacles and the imprecise state observation of robots introduce significant uncertainties for the swarm of robots, especially in cluster environments. However, existing methods struggle to realize safe navigation when considering uncertainties, complex environmental structures, and robot swarms. This paper introduces an extended state model predictive control planner with a safe probability field to address the multi-robot navigation problem in complex, dynamic, and uncertain environments. Initially, the safe probability field offers an innovative approach to model the uncertainty of external dynamic obstacles, combining it with an unconstrained optimization method to generate safe trajectories for multiple robots online. Subsequently, the extended state model predictive controller can accurately track these generated trajectories while considering the robots' inherent model constraints and state uncertainty, thus ensuring the practical feasibility of the planned trajectories. Simulation experiments show a success rate four times higher than that of state-of-the-art algorithms. Physical experiments demonstrate the method's ability to operate in real-time, enabling safe navigation for multiple robots in uncertain environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13573v1-abstract-full').style.display = 'none'; document.getElementById('2410.13573v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
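</p> <p class="is-size-7">As a rough illustration of the probability-field idea above, the snippet below scores candidate waypoints against dynamic obstacles whose predicted positions are modelled as 2-D Gaussians; a waypoint is treated as safe when the summed density falls below a threshold. The Gaussian model, the threshold, and all names are assumptions for illustration and do not reproduce the paper's field construction or MPC formulation.</p> <pre class="is-size-7"><code>
# Toy waypoint-safety check against uncertain obstacle positions (illustrative only).
import numpy as np

def collision_risk(point, obstacle_means, obstacle_covs):
    """Sum of 2-D Gaussian densities of all obstacles evaluated at `point`."""
    risk = 0.0
    for mu, cov in zip(obstacle_means, obstacle_covs):
        diff = point - mu
        inv = np.linalg.inv(cov)
        norm = 1.0 / (2.0 * np.pi * np.sqrt(np.linalg.det(cov)))
        risk += norm * np.exp(-0.5 * diff @ inv @ diff)
    return risk

means = [np.array([1.0, 0.0]), np.array([3.0, 2.0])]   # predicted obstacle positions
covs = [0.3 * np.eye(2), 0.5 * np.eye(2)]              # position uncertainty per obstacle
for p in [np.array([0.0, 0.0]), np.array([1.1, 0.1])]:
    r = collision_risk(p, means, covs)
    print(p, "risk =", round(float(r), 3), "safe" if r < 0.05 else "unsafe")
</code></pre> <p class="is-size-7">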
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12266">arXiv:2410.12266</a> <span> [<a href="https://arxiv.org/pdf/2410.12266">pdf</a>, <a href="https://arxiv.org/format/2410.12266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> FlashAudio: Rectified Flows for Fast and High-Fidelity Text-to-Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huadai Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jialei Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Heng Lu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+W">Wei Xue</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12266v1-abstract-short" style="display: inline;"> Recent advancements in latent diffusion models (LDMs) have markedly enhanced text-to-audio generation, yet their iterative sampling processes impose substantial computational demands, limiting practical deployment. While recent methods utilizing consistency-based distillation aim to achieve few-step or single-step inference, their one-step performance is constrained by curved trajectories, prevent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12266v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12266v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12266v1-abstract-full" style="display: none;"> Recent advancements in latent diffusion models (LDMs) have markedly enhanced text-to-audio generation, yet their iterative sampling processes impose substantial computational demands, limiting practical deployment. While recent methods utilizing consistency-based distillation aim to achieve few-step or single-step inference, their one-step performance is constrained by curved trajectories, preventing them from surpassing traditional diffusion models. In this work, we introduce FlashAudio with rectified flows to learn straight flow for fast simulation. To alleviate the inefficient timesteps allocation and suboptimal distribution of noise, FlashAudio optimizes the time distribution of rectified flow with Bifocal Samplers and proposes immiscible flow to minimize the total distance of data-noise pairs in a batch vias assignment. Furthermore, to address the amplified accumulation error caused by the classifier-free guidance (CFG), we propose Anchored Optimization, which refines the guidance scale by anchoring it to a reference trajectory. Experimental results on text-to-audio generation demonstrate that FlashAudio's one-step generation performance surpasses the diffusion-based models with hundreds of sampling steps on audio quality and enables a sampling speed of 400x faster than real-time on a single NVIDIA 4090Ti GPU. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12266v1-abstract-full').style.display = 'none'; document.getElementById('2410.12266v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12100">arXiv:2410.12100</a> <span> [<a href="https://arxiv.org/pdf/2410.12100">pdf</a>, <a href="https://arxiv.org/format/2410.12100">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Enhancing IoT Communication and Localization via Smarter Antenna </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianxiang Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haofan Lu</a>, <a href="/search/cs?searchtype=author&query=Abari%2C+O">Omid Abari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12100v2-abstract-short" style="display: inline;"> The convergence of sensing and communication functionalities is poised to become a pivotal feature of the sixth-generation (6G) wireless networks. This vision represents a paradigm shift in wireless network design, moving beyond mere communication to a holistic integration of sensing and communication capabilities, thereby further narrowing the gap between the physical and digital worlds. While In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12100v2-abstract-full').style.display = 'inline'; document.getElementById('2410.12100v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12100v2-abstract-full" style="display: none;"> The convergence of sensing and communication functionalities is poised to become a pivotal feature of the sixth-generation (6G) wireless networks. This vision represents a paradigm shift in wireless network design, moving beyond mere communication to a holistic integration of sensing and communication capabilities, thereby further narrowing the gap between the physical and digital worlds. While Internet of Things (IoT) devices are integral to future wireless networks, their current capabilities in sensing and communication are constrained by their power and resource limitations. On one hand, their restricted power budget limits their transmission power, leading to reduced communication range and data rates. On the other hand, their limited hardware and processing abilities hinder the adoption of sophisticated sensing technologies, such as direction finding and localization. In this work, we introduce Wi-Pro, a system which seamlessly integrates today's WiFi protocol with smart antenna design to enhance the communication and sensing capabilities of existing IoT devices. This plug-and-play system can be easily installed by replacing the IoT device's antenna. 
Wi-Pro seamlessly integrates smart antenna hardware with current WiFi protocols, utilizing their inherent features to not only enhance communication but also to enable precise localization on low-cost IoT devices. Our evaluation results demonstrate that Wi-Pro achieves up to 150\% data rate improvement, up to five times range improvement, accurate direction finding, and localization on single-chain IoT devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12100v2-abstract-full').style.display = 'none'; document.getElementById('2410.12100v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE IoT Journal for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10912">arXiv:2410.10912</a> <span> [<a href="https://arxiv.org/pdf/2410.10912">pdf</a>, <a href="https://arxiv.org/format/2410.10912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> AlphaPruning: Using Heavy-Tailed Self Regularization Theory for Improved Layer-wise Pruning of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haiquan Lu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yefan Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shiwei Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a>, <a href="/search/cs?searchtype=author&query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaoqing Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10912v1-abstract-short" style="display: inline;"> Recent work on pruning large language models (LLMs) has shown that one can eliminate a large number of parameters without compromising performance, making pruning a promising strategy to reduce LLM model size. 
Existing LLM pruning strategies typically assign uniform pruning ratios across layers, limiting overall pruning ability; and recent work on layerwise pruning of LLMs is often based on heuris… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10912v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10912v1-abstract-full" style="display: none;"> Recent work on pruning large language models (LLMs) has shown that one can eliminate a large number of parameters without compromising performance, making pruning a promising strategy to reduce LLM model size. Existing LLM pruning strategies typically assign uniform pruning ratios across layers, limiting overall pruning ability; and recent work on layerwise pruning of LLMs is often based on heuristics that can easily lead to suboptimal performance. In this paper, we leverage Heavy-Tailed Self-Regularization (HT-SR) Theory, in particular the shape of empirical spectral densities (ESDs) of weight matrices, to design improved layerwise pruning ratios for LLMs. Our analysis reveals a wide variability in how well-trained, and thus relatedly how prunable, different layers of an LLM are. Based on this, we propose AlphaPruning, which uses shape metrics to allocate layerwise sparsity ratios in a more theoretically principled manner. AlphaPruning can be used in conjunction with multiple existing LLM pruning methods. Our empirical results show that AlphaPruning prunes LLaMA-7B to 80% sparsity while maintaining reasonable perplexity, marking a first in the literature on LLMs. We have open-sourced our code at https://github.com/haiquanlu/AlphaPruning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10912v1-abstract-full').style.display = 'none'; document.getElementById('2410.10912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
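</p> <p class="is-size-7">The allocation idea above, reading how heavy-tailed each layer's empirical spectral density (ESD) is and assigning sparsity accordingly, can be sketched as below. The Hill-style exponent estimate, the linear mapping to sparsities, and the direction of the mapping (heavier-tailed layers pruned less) are simplifying assumptions for illustration, not the paper's exact metric or schedule.</p> <pre class="is-size-7"><code>
# Sketch: allocate per-layer sparsity from the tail exponent of each layer's ESD.
import numpy as np

def hill_alpha(eigs, k_frac=0.5):
    """Crude Hill-style estimate of the power-law exponent of the ESD tail."""
    eigs = np.sort(eigs)[::-1]
    k = max(2, int(len(eigs) * k_frac))
    tail = eigs[:k]
    return 1.0 + k / (np.sum(np.log(tail / tail[-1])) + 1e-12)

def layer_sparsities(weight_matrices, target=0.7, spread=0.2):
    """Layers with smaller alpha (heavier tail, assumed better trained) get lower
    sparsity; the average sparsity stays near `target`."""
    alphas = np.array([hill_alpha(np.linalg.svd(W, compute_uv=False) ** 2)
                       for W in weight_matrices])
    ranks = (alphas - alphas.min()) / (alphas.max() - alphas.min() + 1e-12)
    s = target + spread * (ranks - ranks.mean())
    return np.clip(s, 0.0, 0.99)

rng = np.random.default_rng(1)
demo_layers = [rng.normal(size=(256, 256)) for _ in range(4)]
print(layer_sparsities(demo_layers))          # one sparsity ratio per layer
</code></pre> <p class="is-size-7">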
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024, first two authors contributed equally</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10743">arXiv:2410.10743</a> <span> [<a href="https://arxiv.org/pdf/2410.10743">pdf</a>, <a href="https://arxiv.org/format/2410.10743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> NT-LLM: A Novel Node Tokenizer for Integrating Graph Structure into Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yanbiao Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yue Ding</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Dan Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mei Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wenqing Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongtao Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10743v1-abstract-short" style="display: inline;"> Graphs are a fundamental data structure for representing relationships in real-world scenarios. With the success of Large Language Models (LLMs) across various natural language processing (NLP) tasks, there has been growing interest in integrating LLMs for graph learning. However, applying LLMs to graph-related tasks poses significant challenges, as these models are not inherently designed to capt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10743v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10743v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10743v1-abstract-full" style="display: none;"> Graphs are a fundamental data structure for representing relationships in real-world scenarios. With the success of Large Language Models (LLMs) across various natural language processing (NLP) tasks, there has been growing interest in integrating LLMs for graph learning. However, applying LLMs to graph-related tasks poses significant challenges, as these models are not inherently designed to capture the complex structural information present in graphs. Existing approaches address this challenge through two strategies: the chain of tasks approach, which uses Graph Neural Networks (GNNs) to encode the graph structure so that LLMs are relieved from understanding spatial positions; and Graph-to-Text Conversion, which translates graph structures into semantic text representations that LLMs can process. Despite their progress, these methods often struggle to fully preserve the topological information of graphs or require extensive computational resources, limiting their practical applicability. 
In this work, we introduce Node Tokenizer for Large Language Models (NT-LLM), a novel framework that efficiently encodes graph structures by selecting key nodes as anchors and representing each node based on its relative distance to these anchors. This position-anchored encoding effectively captures the graph topology, enabling enhanced reasoning capabilities in LLMs over graph data. Additionally, we implement a task-specific tuning procedure to further improve structural understanding within LLMs. Through extensive empirical evaluations, NT-LLM demonstrates significant performance improvements across a variety of graph-related tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10743v1-abstract-full').style.display = 'none'; document.getElementById('2410.10743v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10105">arXiv:2410.10105</a> <span> [<a href="https://arxiv.org/pdf/2410.10105">pdf</a>, <a href="https://arxiv.org/format/2410.10105">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> High-Precision Dichotomous Image Segmentation via Probing Diffusion Capacity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qian Yu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Peng-Tao Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinwei Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lihe Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10105v1-abstract-short" style="display: inline;"> In the realm of high-resolution (HR), fine-grained image segmentation, the primary challenge is balancing broad contextual awareness with the precision required for detailed object delineation, capturing intricate details and the finest edges of objects. Diffusion models, trained on vast datasets comprising billions of image-text pairs, such as SD V2.1, have revolutionized text-to-image synthesis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10105v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10105v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10105v1-abstract-full" style="display: none;"> In the realm of high-resolution (HR), fine-grained image segmentation, the primary challenge is balancing broad contextual awareness with the precision required for detailed object delineation, capturing intricate details and the finest edges of objects. 
Diffusion models, trained on vast datasets comprising billions of image-text pairs, such as SD V2.1, have revolutionized text-to-image synthesis by delivering exceptional quality, fine detail resolution, and strong contextual awareness, making them an attractive solution for high-resolution image segmentation. To this end, we propose DiffDIS, a diffusion-driven segmentation model that taps into the potential of the pre-trained U-Net within diffusion models, specifically designed for high-resolution, fine-grained object segmentation. By leveraging the robust generalization capabilities and rich, versatile image representation prior of the SD models, coupled with a task-specific stable one-step denoising approach, we significantly reduce the inference time while preserving high-fidelity, detailed generation. Additionally, we introduce an auxiliary edge generation task to not only enhance the preservation of fine details of the object boundaries, but reconcile the probabilistic nature of diffusion with the deterministic demands of segmentation. With these refined strategies in place, DiffDIS serves as a rapid object mask generation model, specifically optimized for generating detailed binary maps at high resolutions, while demonstrating impressive accuracy and swift processing. Experiments on the DIS5K dataset demonstrate the superiority of DiffDIS, achieving state-of-the-art results through a streamlined inference process. Our code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10105v1-abstract-full').style.display = 'none'; document.getElementById('2410.10105v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10086">arXiv:2410.10086</a> <span> [<a href="https://arxiv.org/pdf/2410.10086">pdf</a>, <a href="https://arxiv.org/format/2410.10086">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> VNF Migration with Fast Defragmentation: A GAT-Based Deep Learning Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fangyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuang Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hancheng Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengdi Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10086v1-abstract-short" style="display: inline;"> Network function virtualization (NFV) enhances service flexibility by decoupling network functions from dedicated hardware. To handle time-varying traffic in NFV network, virtualized network function (VNF) migration has been involved to dynamically adjust resource allocation. 
However, as network functions diversify, different resource types may be underutilized due to bottlenecks, which can be des… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10086v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10086v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10086v1-abstract-full" style="display: none;"> Network function virtualization (NFV) enhances service flexibility by decoupling network functions from dedicated hardware. To handle time-varying traffic in NFV networks, virtualized network function (VNF) migration has been introduced to dynamically adjust resource allocation. However, as network functions diversify, different resource types may be underutilized due to bottlenecks, which can be described as multidimensional resource fragmentation. To address this issue, we first define a metric to quantify resource fragmentation in NFV networks. Then, we propose a multi-hop graph attention network (MHGAT) model to effectively extract resource features from tailored network layers, which captures the overall network state and produces high-quality strategies rapidly. Building on this, we develop an MHGAT method to implement fast defragmentation and optimize VNF migration. Simulations demonstrate that by fast defragmentation, the MHGAT method improves the acceptance ratio by an average of 12.8%, reduces the overload ratio by an average of 30.6%, and lowers migration loss by an average of 43.3% compared to the state-of-the-art benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10086v1-abstract-full').style.display = 'none'; document.getElementById('2410.10086v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
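<p class="is-size-7">The abstract does not spell out its fragmentation metric, so the snippet below is only a generic stand-in for the idea of multidimensional resource fragmentation: a node whose leftover CPU/memory/bandwidth shares are badly unbalanced is hard to reuse, which we score with a coefficient of variation over residual capacities. The function and field names are hypothetical, not the paper's definition.</p> <pre><code class="language-python">
# Rough stand-in for a multidimensional resource-fragmentation score
# (illustrative only; the MHGAT paper defines its own metric).
from statistics import mean, pstdev

def node_fragmentation(residual):
    """residual: dict of resource type to fraction of capacity left, e.g.
    {"cpu": 0.7, "mem": 0.1, "bw": 0.6}. A node whose leftover resources are
    unbalanced across types is hard to reuse, so we score that imbalance."""
    values = list(residual.values())
    avg = mean(values)
    if avg == 0.0:
        return 0.0  # nothing left, nothing fragmented
    return pstdev(values) / avg  # coefficient of variation of residuals

def network_fragmentation(nodes):
    """Average per-node fragmentation over all servers in the NFV network."""
    return mean(node_fragmentation(r) for r in nodes)

print(network_fragmentation([{"cpu": 0.7, "mem": 0.1, "bw": 0.6},
                             {"cpu": 0.4, "mem": 0.4, "bw": 0.4}]))
</code></pre>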
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures, submitted to IEEE Transaction on Network and Service Management</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09416">arXiv:2410.09416</a> <span> [<a href="https://arxiv.org/pdf/2410.09416">pdf</a>, <a href="https://arxiv.org/format/2410.09416">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can Vision-Language Models Replace Human Annotators: A Case Study with CelebA Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haoming Lu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+F">Feifei Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09416v1-abstract-short" style="display: inline;"> This study evaluates the capability of Vision-Language Models (VLMs) in image data annotation by comparing their performance on the CelebA dataset in terms of quality and cost-effectiveness against manual annotation. Annotations from the state-of-the-art LLaVA-NeXT model on 1000 CelebA images are in 79.5% agreement with the original human annotations. Incorporating re-annotations of disagreed case… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09416v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09416v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09416v1-abstract-full" style="display: none;"> This study evaluates the capability of Vision-Language Models (VLMs) in image data annotation by comparing their performance on the CelebA dataset in terms of quality and cost-effectiveness against manual annotation. Annotations from the state-of-the-art LLaVA-NeXT model on 1000 CelebA images are in 79.5% agreement with the original human annotations. Incorporating re-annotations of disagreed cases into a majority vote boosts AI annotation consistency to 89.1% and even higher for more objective labels. Cost assessments demonstrate that AI annotation significantly reduces expenditures compared to traditional manual methods -- representing less than 1% of the costs for manual annotation in the CelebA dataset. These findings support the potential of VLMs as a viable, cost-effective alternative for specific annotation tasks, reducing both financial burden and ethical concerns associated with large-scale manual data annotation. The AI annotations and re-annotations utilized in this study are available on https://github.com/evev2024/EVEV2024_CelebA. 
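<p class="is-size-7">For the CelebA annotation study above, the agreement and majority-vote figures can be reproduced mechanically once per-image annotations are collected; the sketch below shows only that basic computation with made-up labels, not the study's actual pipeline.</p> <pre><code class="language-python">
# Minimal sketch: majority vote over repeated VLM annotations for one
# attribute, then agreement with the human CelebA label (illustrative only).
from collections import Counter

def majority_vote(labels):
    """Return the most common label among repeated annotations."""
    return Counter(labels).most_common(1)[0][0]

def agreement_rate(vlm_annotations, human_labels):
    """vlm_annotations: list of lists (one inner list per image);
    human_labels: list of ground-truth labels of the same length."""
    hits = sum(1 for votes, truth in zip(vlm_annotations, human_labels)
               if majority_vote(votes) == truth)
    return hits / len(human_labels)

print(agreement_rate([[1, 1, 0], [0, 0, 0]], [1, 1]))  # 0.5
</code></pre>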
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09416v1-abstract-full').style.display = 'none'; document.getElementById('2410.09416v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024 Workshop (EvalEval 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09336">arXiv:2410.09336</a> <span> [<a href="https://arxiv.org/pdf/2410.09336">pdf</a>, <a href="https://arxiv.org/format/2410.09336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Novel Multi-Gait Strategy for Stable and Efficient Quadruped Robot Locomotion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Daoxun Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xieyuanli Chen</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zhengyu Zhong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Ming Xu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhiqiang Zheng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huimin Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09336v1-abstract-short" style="display: inline;"> Taking inspiration from the natural gait transition mechanism of quadrupeds, devising a good gait transition strategy is important for quadruped robots to achieve energy-efficient locomotion on various terrains and velocities. While previous studies have recognized that gait patterns linked to velocities impact two key factors, the Cost of Transport (CoT) and the stability of robot locomotion, onl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09336v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09336v1-abstract-full" style="display: none;"> Taking inspiration from the natural gait transition mechanism of quadrupeds, devising a good gait transition strategy is important for quadruped robots to achieve energy-efficient locomotion on various terrains and velocities. While previous studies have recognized that gait patterns linked to velocities impact two key factors, the Cost of Transport (CoT) and the stability of robot locomotion, only a limited number of studies have effectively combined these factors to design a mechanism that ensures both efficiency and stability in quadruped robot locomotion. In this paper, we propose a multi-gait selection and transition strategy to achieve stable and efficient locomotion across different terrains. Our strategy starts by establishing a gait mapping considering both CoT and locomotion stability to guide the gait selection process during locomotion. 
Then, we achieve gait switching in time by introducing affine transformations for gait parameters and a designed finite state machine to build the switching order. Comprehensive experiments have been conducted on using our strategy with changing terrains and velocities, and the results indicate that our proposed strategy outperforms baseline methods in achieving simultaneous efficiency in locomotion by considering CoT and stability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09336v1-abstract-full').style.display = 'none'; document.getElementById('2410.09336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08035">arXiv:2410.08035</a> <span> [<a href="https://arxiv.org/pdf/2410.08035">pdf</a>, <a href="https://arxiv.org/format/2410.08035">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IntrinsicVoice: Empowering LLMs with Intrinsic Real-time Voice Interaction Abilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+X">Xiang Lyu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hangrui Hu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaohong Tan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tianyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bin Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Heng Lu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yaqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08035v2-abstract-short" style="display: inline;"> Current methods of building LLMs with voice interaction capabilities rely heavily on explicit text autoregressive generation before or during speech response generation to maintain content quality, which unfortunately brings computational overhead and increases latency in multi-turn interactions. 
To address this, we introduce IntrinsicVoice, an LLM designed with intrinsic real-time voice interacti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08035v2-abstract-full').style.display = 'inline'; document.getElementById('2410.08035v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08035v2-abstract-full" style="display: none;"> Current methods of building LLMs with voice interaction capabilities rely heavily on explicit text autoregressive generation before or during speech response generation to maintain content quality, which unfortunately brings computational overhead and increases latency in multi-turn interactions. To address this, we introduce IntrinsicVoice, an LLM designed with intrinsic real-time voice interaction capabilities. IntrinsicVoice aims to facilitate the transfer of textual capabilities of pre-trained LLMs to the speech modality by mitigating the modality gap between text and speech. Our novel architecture, GroupFormer, can reduce speech sequences to lengths comparable to text sequences while generating high-quality audio, significantly reducing the length difference between speech and text, speeding up inference, and alleviating long-text modeling issues. Additionally, we construct a multi-turn speech-to-speech dialogue dataset named \method-500k, which includes nearly 500k turns of speech-to-speech dialogues, and a cross-modality training strategy to enhance the semantic alignment between speech and text. Experimental results demonstrate that IntrinsicVoice can generate high-quality speech responses with latency lower than 100ms in multi-turn dialogue scenarios. Demos are available at https://instrinsicvoice.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08035v2-abstract-full').style.display = 'none'; document.getElementById('2410.08035v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
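<p class="is-size-7">GroupFormer itself is not described in enough detail in the abstract to reproduce; the snippet below only illustrates the generic idea of merging consecutive frame-level speech features into groups so the LLM sees a roughly text-length sequence. The shapes, group size, and averaging choice are assumptions, not the paper's design.</p> <pre><code class="language-python">
# Generic idea only (not the paper's GroupFormer): merge every `group`
# consecutive frame-level speech embeddings into one slot so the sequence
# the LLM sees is roughly text-length.
import torch

def group_speech_tokens(frames, group=5):
    """frames: (batch, T, dim) frame-level speech features.
    Returns (batch, ceil(T/group), dim) by averaging within each group."""
    b, t, d = frames.shape
    pad = (-t) % group                      # right-pad so T divides evenly
    if pad:
        frames = torch.cat([frames, frames.new_zeros(b, pad, d)], dim=1)
    return frames.view(b, -1, group, d).mean(dim=2)

x = torch.randn(2, 103, 256)                # ~100 frames of speech features
print(group_speech_tokens(x).shape)         # torch.Size([2, 21, 256])
</code></pre>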
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07030">arXiv:2410.07030</a> <span> [<a href="https://arxiv.org/pdf/2410.07030">pdf</a>, <a href="https://arxiv.org/format/2410.07030">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Clean Evaluations on Contaminated Visual Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+S">Shujie Miao</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+W">Wai Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07030v1-abstract-short" style="display: inline;"> How to evaluate large language models (LLMs) cleanly has been established as an important research area to genuinely report the performance of possibly contaminated LLMs. Yet, how to cleanly evaluate the visual language models (VLMs) is an under-studied problem. We propose a novel approach to achieve such goals through data augmentation methods on the visual input information. We then craft a new v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07030v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07030v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07030v1-abstract-full" style="display: none;"> How to evaluate large language models (LLMs) cleanly has been established as an important research area to genuinely report the performance of possibly contaminated LLMs. Yet, how to cleanly evaluate the visual language models (VLMs) is an under-studied problem. We propose a novel approach to achieve such goals through data augmentation methods on the visual input information. We then craft a new visual clean evaluation benchmark with thousands of data instances. Through extensive experiments, we found that the traditional visual data augmentation methods are useful, but they are at risk of being used as a part of the training data as a workaround. We further propose using BGR augmentation to switch the colour channel of the visual information. We found that it is a simple yet effective method for reducing the effect of data contamination, and, fortunately, it is also harmful when used as a data augmentation method during training. It means that it is hard to integrate such data augmentation into training by malicious trainers, and it could be a promising technique to cleanly evaluate visual LLMs. Our code, data, and model weights will be released upon publication.
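<p class="is-size-7">BGR augmentation as described here, i.e. reversing the colour channels of the input image before it reaches the VLM under evaluation, can be sketched as follows; the file names are placeholders and this is only an illustration of the transformation, not the authors' evaluation code.</p> <pre><code class="language-python">
# Sketch of BGR augmentation: swap the colour channels of an image before
# passing it to the VLM being evaluated (illustrative only).
from PIL import Image
import numpy as np

def to_bgr(image_path):
    """Load an RGB image and return it with channels reversed (BGR)."""
    rgb = np.array(Image.open(image_path).convert("RGB"))
    bgr = rgb[:, :, ::-1].copy()            # reverse the channel axis
    return Image.fromarray(bgr)

# to_bgr("example.jpg").save("example_bgr.jpg")  # hypothetical file names
</code></pre>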
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07030v1-abstract-full').style.display = 'none'; document.getElementById('2410.07030v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06328">arXiv:2410.06328</a> <span> [<a href="https://arxiv.org/pdf/2410.06328">pdf</a>, <a href="https://arxiv.org/format/2410.06328">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Auto-Evolve: Enhancing Large Language Model's Performance via Self-Reasoning Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aswani%2C+K">Krishna Aswani</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huilin Lu</a>, <a href="/search/cs?searchtype=author&query=Patankar%2C+P">Pranav Patankar</a>, <a href="/search/cs?searchtype=author&query=Dhalwani%2C+P">Priya Dhalwani</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+I">Iris Tan</a>, <a href="/search/cs?searchtype=author&query=Ganeshmohan%2C+J">Jayant Ganeshmohan</a>, <a href="/search/cs?searchtype=author&query=Lacasse%2C+S">Simon Lacasse</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06328v2-abstract-short" style="display: inline;"> Recent advancements in prompt engineering strategies, such as Chain-of-Thought (CoT) and Self-Discover, have demonstrated significant potential in improving the reasoning abilities of Large Language Models (LLMs). However, these state-of-the-art (SOTA) prompting strategies rely on single or fixed set of static seed reasoning modules like "think step by step" or "break down this problem" intended t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06328v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06328v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06328v2-abstract-full" style="display: none;"> Recent advancements in prompt engineering strategies, such as Chain-of-Thought (CoT) and Self-Discover, have demonstrated significant potential in improving the reasoning abilities of Large Language Models (LLMs). However, these state-of-the-art (SOTA) prompting strategies rely on single or fixed set of static seed reasoning modules like "think step by step" or "break down this problem" intended to simulate human approach to problem-solving. This constraint limits the flexibility of models in tackling diverse problems effectively. 
In this paper, we introduce Auto-Evolve, a novel framework that enables LLMs to self-create dynamic reasoning modules and downstream action plan, resulting in significant improvements over current SOTA methods. We evaluate Auto-Evolve on the challenging BigBench-Hard (BBH) dataset with Claude 2.0, Claude 3 Sonnet, Mistral Large, and GPT 4, where it consistently outperforms the SOTA prompt strategies. Auto-Evolve outperforms CoT by up to 10.4% and on an average by 7% across these four models. Our framework introduces two innovations: a) Auto-Evolve dynamically generates reasoning modules for each task while aligning with human reasoning paradigm, thus eliminating the need for predefined templates. b) We introduce an iterative refinement component, that incrementally refines instruction guidance for LLMs and helps boost performance by average 2.8% compared to doing it in a single step. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06328v2-abstract-full').style.display = 'none'; document.getElementById('2410.06328v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05577">arXiv:2410.05577</a> <span> [<a href="https://arxiv.org/pdf/2410.05577">pdf</a>, <a href="https://arxiv.org/format/2410.05577">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Underwater Object Detection in the Era of Artificial Intelligence: Current, Challenge, and Future </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuzhi Huang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J">Junyu Dong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qi Xu</a>, <a href="/search/cs?searchtype=author&query=Kwong%2C+S">Sam Kwong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huimin Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chongyi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05577v1-abstract-short" style="display: inline;"> Underwater object detection (UOD), aiming to identify and localise the objects in underwater images or videos, presents significant challenges due to the optical distortion, water turbidity, and changing illumination in underwater scenes. In recent years, artificial intelligence (AI) based methods, especially deep learning methods, have shown promising performance in UOD. 
To further facilitate fut… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05577v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05577v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05577v1-abstract-full" style="display: none;"> Underwater object detection (UOD), aiming to identify and localise the objects in underwater images or videos, presents significant challenges due to the optical distortion, water turbidity, and changing illumination in underwater scenes. In recent years, artificial intelligence (AI) based methods, especially deep learning methods, have shown promising performance in UOD. To further facilitate future advancements, we comprehensively study AI-based UOD. In this survey, we first categorise existing algorithms into traditional machine learning-based methods and deep learning-based methods, and summarise them by considering learning strategy, experimental dataset, utilised features or frameworks, and learning stage. Next, we discuss the potential challenges and suggest possible solutions and new directions. We also perform both quantitative and qualitative evaluations of mainstream algorithms across multiple benchmark datasets by considering the diverse and biased experimental setups. Finally, we introduce two off-the-shelf detection analysis tools, Diagnosis and TIDE, which thoroughly examine the effects of object characteristics and various types of errors on detectors. These tools help identify the strengths and weaknesses of detectors, providing insights for further improvement. The source codes, trained models, utilised datasets, detection results, and detection analysis tools are publicly available at \url{https://github.com/LongChenCV/UODReview}, and will be regularly updated. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05577v1-abstract-full').style.display = 'none'; document.getElementById('2410.05577v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04155">arXiv:2410.04155</a> <span> [<a href="https://arxiv.org/pdf/2410.04155">pdf</a>, <a href="https://arxiv.org/format/2410.04155">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Toxic Subword Pruning for Dialogue Response Generation on Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+W">Wai Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04155v1-abstract-short" style="display: inline;"> How to defend large language models (LLMs) from generating toxic content is an important research area. Yet, most research has focused on various model training techniques to remediate LLMs by updating their weights.
A typical related research area is safety alignment. This, however, is often costly and tedious and can expose the model to even more problems such as catastrophic forgetting if the trainin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04155v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04155v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04155v1-abstract-full" style="display: none;"> How to defend large language models (LLMs) from generating toxic content is an important research area. Yet, most research has focused on various model training techniques to remediate LLMs by updating their weights. A typical related research area is safety alignment. This, however, is often costly and tedious and can expose the model to even more problems such as catastrophic forgetting if the training is not carefully handled by experienced NLP practitioners. We thus propose a simple yet effective and novel algorithm, namely \textbf{Tox}ic Subword \textbf{Prun}ing (ToxPrune), to prune the subwords contained in toxic words from the BPE vocabulary of trained LLMs. In contrast to previous work that shows pruning BPE tokens to be harmful to the task of machine translation, we surprisingly found it useful in preventing toxic content from being generated by LLMs. Fortunately, our findings suggest that ToxPrune noticeably improves the toxic language model NSFW-3B on the task of dialogue response generation. We surprisingly found that ToxPrune can even noticeably improve the official Llama-3.1-6B in the metric of dialogue diversity. Extensive automatic results and human evaluation indicate that ToxPrune could be helpful for both remediating toxic LLMs and improving non-toxic LLMs on the task of dialogue response generation.\footnote{We plan to release the resources to facilitate future work.} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04155v1-abstract-full').style.display = 'none'; document.getElementById('2410.04155v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
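<p class="is-size-7">ToxPrune itself prunes toxic subwords from the BPE vocabulary of a trained model; a lightweight approximation of the same effect at decoding time is to ban those subword ids via Hugging Face's bad_words_ids generation argument, as sketched below. The model name and word list are placeholders, and this is not the paper's exact procedure.</p> <pre><code class="language-python">
# Decode-time approximation of the idea (not the paper's vocabulary pruning):
# ban every subword that a list of toxic words tokenizes into.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"                             # stand-in model for illustration
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

toxic_words = ["examplecurse", "anotherslur"]   # placeholder word list
banned_ids = [[i] for w in toxic_words
              for i in tok(w, add_special_tokens=False).input_ids]

prompt = tok("User: hello\nAssistant:", return_tensors="pt")
out = model.generate(**prompt, max_new_tokens=40, bad_words_ids=banned_ids)
print(tok.decode(out[0], skip_special_tokens=True))
</code></pre>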
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01289">arXiv:2410.01289</a> <span> [<a href="https://arxiv.org/pdf/2410.01289">pdf</a>, <a href="https://arxiv.org/format/2410.01289">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> The Unlikely Hero: Nonideality in Analog Photonic Neural Networks as Built-in Defender Against Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haotian Lu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Ziang Yin</a>, <a href="/search/cs?searchtype=author&query=Bhoumik%2C+P">Partho Bhoumik</a>, <a href="/search/cs?searchtype=author&query=Banerjee%2C+S">Sanmitra Banerjee</a>, <a href="/search/cs?searchtype=author&query=Chakrabarty%2C+K">Krishnendu Chakrabarty</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiaqi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01289v1-abstract-short" style="display: inline;"> Electronic-photonic computing systems have emerged as a promising platform for accelerating deep neural network (DNN) workloads. Major efforts have been focused on countering hardware non-idealities and boosting efficiency with various hardware/algorithm co-design methods. However, the adversarial robustness of such photonic analog mixed-signal AI hardware remains unexplored. Though the hardware v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01289v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01289v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01289v1-abstract-full" style="display: none;"> Electronic-photonic computing systems have emerged as a promising platform for accelerating deep neural network (DNN) workloads. Major efforts have been focused on countering hardware non-idealities and boosting efficiency with various hardware/algorithm co-design methods. However, the adversarial robustness of such photonic analog mixed-signal AI hardware remains unexplored. Though the hardware variations can be mitigated with robustness-driven optimization methods, malicious attacks on the hardware show distinct behaviors from noises, which requires a customized protection method tailored to optical analog hardware. In this work, we rethink the role of conventionally undesired non-idealities in photonic analog accelerators and claim their surprising effects on defending against adversarial weight attacks. Inspired by the protection effects from DNN quantization and pruning, we propose a synergistic defense framework tailored for optical analog hardware that proactively protects sensitive weights via pre-attack unary weight encoding and post-attack vulnerability-aware weight locking. 
Efficiency-reliability trade-offs are formulated as constrained optimization problems and efficiently solved offline without model re-training costs. Extensive evaluation of various DNN benchmarks with a multi-core photonic accelerator shows that our framework maintains near-ideal on-chip inference accuracy under adversarial bit-flip attacks with merely <3% memory overhead. Our codes are open-sourced at https://github.com/ScopeX-ASU/Unlikely_Hero. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01289v1-abstract-full').style.display = 'none'; document.getElementById('2410.01289v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages. Accepted to ACM/IEEE ASP-DAC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17564">arXiv:2409.17564</a> <span> [<a href="https://arxiv.org/pdf/2409.17564">pdf</a>, <a href="https://arxiv.org/format/2409.17564">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> General Compression Framework for Efficient Transformer Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinglun Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shilin Yan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kaixun Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shuyong Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hong Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17564v1-abstract-short" style="display: inline;"> Transformer-based trackers have established a dominant role in the field of visual object tracking. While these trackers exhibit promising performance, their deployment on resource-constrained devices remains challenging due to inefficiencies. 
To improve the inference efficiency and reduce the computation cost, prior approaches have aimed to either design lightweight trackers or distill knowledge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17564v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17564v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17564v1-abstract-full" style="display: none;"> Transformer-based trackers have established a dominant role in the field of visual object tracking. While these trackers exhibit promising performance, their deployment on resource-constrained devices remains challenging due to inefficiencies. To improve the inference efficiency and reduce the computation cost, prior approaches have aimed to either design lightweight trackers or distill knowledge from larger teacher models into more compact student trackers. However, these solutions often sacrifice accuracy for speed. Thus, we propose a general model compression framework for efficient transformer object tracking, named CompressTracker, to reduce the size of a pre-trained tracking model into a lightweight tracker with minimal performance degradation. Our approach features a novel stage division strategy that segments the transformer layers of the teacher model into distinct stages, enabling the student model to emulate each corresponding teacher stage more effectively. Additionally, we design a unique replacement training technique that involves randomly substituting specific stages in the student model with those from the teacher model, as opposed to training the student model in isolation. Replacement training enhances the student model's ability to replicate the teacher model's behavior. To further force the student model to emulate the teacher model, we incorporate prediction guidance and stage-wise feature mimicking to provide additional supervision during the teacher model's compression process. Our framework, CompressTracker, is structurally agnostic, making it compatible with any transformer architecture. We conduct a series of experiments to verify the effectiveness and generalizability of CompressTracker. Our CompressTracker-4 with 4 transformer layers, which is compressed from OSTrack, retains about 96% performance on LaSOT (66.1% AUC) while achieving a 2.17x speed-up. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17564v1-abstract-full').style.display = 'none'; document.getElementById('2409.17564v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
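<p class="is-size-7">A minimal sketch of the replacement-training idea described above, i.e. randomly swapping a student stage for the corresponding frozen teacher stage during training, is shown below; the stage sizes, substitution probability, and class name are assumptions, not CompressTracker's actual implementation.</p> <pre><code class="language-python">
# Minimal sketch of replacement training (stage shapes and names are made up):
# during each training forward pass, each student stage is replaced by the
# corresponding frozen teacher stage with probability p.
import random
import torch
from torch import nn

class ReplacementTracker(nn.Module):
    def __init__(self, teacher_stages, student_stages, p=0.5):
        super().__init__()
        self.teacher = nn.ModuleList(teacher_stages).requires_grad_(False)
        self.student = nn.ModuleList(student_stages)
        self.p = p

    def forward(self, x):
        for t_stage, s_stage in zip(self.teacher, self.student):
            use_teacher = self.training and self.p > random.random()
            x = t_stage(x) if use_teacher else s_stage(x)
        return x

stages = lambda: [nn.Linear(64, 64) for _ in range(4)]
model = ReplacementTracker(stages(), stages())
print(model(torch.randn(8, 64)).shape)          # torch.Size([8, 64])
</code></pre>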
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17316">arXiv:2409.17316</a> <span> [<a href="https://arxiv.org/pdf/2409.17316">pdf</a>, <a href="https://arxiv.org/format/2409.17316">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bi-TTA: Bidirectional Test-Time Adapter for Remote Physiological Measurement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Haodong Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hao Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17316v1-abstract-short" style="display: inline;"> Remote photoplethysmography (rPPG) is gaining prominence for its non-invasive approach to monitoring physiological signals using only cameras. Despite its promise, the adaptability of rPPG models to new, unseen domains is hindered due to the environmental sensitivity of physiological signals. To address this, we pioneer the Test-Time Adaptation (TTA) in rPPG, enabling the adaptation of pre-trained… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17316v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17316v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17316v1-abstract-full" style="display: none;"> Remote photoplethysmography (rPPG) is gaining prominence for its non-invasive approach to monitoring physiological signals using only cameras. Despite its promise, the adaptability of rPPG models to new, unseen domains is hindered due to the environmental sensitivity of physiological signals. To address this, we pioneer the Test-Time Adaptation (TTA) in rPPG, enabling the adaptation of pre-trained models to the target domain during inference, sidestepping the need for annotations or source data due to privacy considerations. Particularly, utilizing only the user's face video stream as the accessible target domain data, the rPPG model is adjusted by tuning on each single instance it encounters. However, 1) TTA algorithms are designed predominantly for classification tasks, ill-suited in regression tasks such as rPPG due to inadequate supervision. 2) Tuning pre-trained models in a single-instance manner introduces variability and instability, posing challenges to effectively filtering domain-relevant from domain-irrelevant features while simultaneously preserving the learned information. To overcome these challenges, we present Bi-TTA, a novel expert knowledge-based Bidirectional Test-Time Adapter framework. Specifically, leveraging two expert-knowledge priors for providing self-supervision, our Bi-TTA primarily comprises two modules: a prospective adaptation (PA) module using sharpness-aware minimization to eliminate domain-irrelevant noise, enhancing the stability and efficacy during the adaptation process, and a retrospective stabilization (RS) module to dynamically reinforce crucial learned model parameters, averting performance degradation caused by overfitting or catastrophic forgetting. 
To this end, we established a large-scale benchmark for rPPG tasks under TTA protocol. The experimental results demonstrate the significant superiority of our approach over the state-of-the-art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17316v1-abstract-full').style.display = 'none'; document.getElementById('2409.17316v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://bi-tta.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16876">arXiv:2409.16876</a> <span> [<a href="https://arxiv.org/pdf/2409.16876">pdf</a>, <a href="https://arxiv.org/format/2409.16876">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Automating Traffic Model Enhancement with AI Research Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xusen Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xinxi Yang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+M">Mingxing Peng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongliang Lu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Meixin Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hai Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16876v2-abstract-short" style="display: inline;"> Developing efficient traffic models is essential for optimizing transportation systems, yet current approaches remain time-intensive and susceptible to human errors due to their reliance on manual processes. Traditional workflows involve exhaustive literature reviews, formula optimization, and iterative testing, leading to inefficiencies in research. In response, we introduce the Traffic Research… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16876v2-abstract-full').style.display = 'inline'; document.getElementById('2409.16876v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16876v2-abstract-full" style="display: none;"> Developing efficient traffic models is essential for optimizing transportation systems, yet current approaches remain time-intensive and susceptible to human errors due to their reliance on manual processes. Traditional workflows involve exhaustive literature reviews, formula optimization, and iterative testing, leading to inefficiencies in research. In response, we introduce the Traffic Research Agent (TR-Agent), an AI-driven system designed to autonomously develop and refine traffic models through an iterative, closed-loop process. 
Specifically, we divide the research pipeline into four key stages: idea generation, theory formulation, theory evaluation, and iterative optimization; and construct TR-Agent with four corresponding modules: Idea Generator, Code Generator, Evaluator, and Analyzer. Working in synergy, these modules retrieve knowledge from external resources, generate novel ideas, implement and debug models, and finally assess them on the evaluation datasets. Furthermore, the system continuously refines these models based on iterative feedback, enhancing research efficiency and model performance. Experimental results demonstrate that TR-Agent achieves significant performance improvements across multiple traffic models, including the Intelligent Driver Model (IDM) for car following, the MOBIL lane-changing model, and the Lighthill-Whitham-Richards (LWR) traffic flow model. Additionally, TR-Agent provides detailed explanations for its optimizations, allowing researchers to verify and build upon its improvements easily. This flexibility makes the framework a powerful tool for researchers in transportation and beyond. To further support research and collaboration, we have open-sourced both the code and data used in our experiments, facilitating broader access and enabling continued advancements in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16876v2-abstract-full').style.display = 'none'; document.getElementById('2409.16876v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">52 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14401">arXiv:2409.14401</a> <span> [<a href="https://arxiv.org/pdf/2409.14401">pdf</a>, <a href="https://arxiv.org/format/2409.14401">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Investigating the Impact of Hard Samples on Accuracy Reveals In-class Data Imbalance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pukowski%2C+P">Pawel Pukowski</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haiping Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14401v1-abstract-short" style="display: inline;"> In the AutoML domain, test accuracy is heralded as the quintessential metric for evaluating model efficacy, underpinning a wide array of applications from neural architecture search to hyperparameter optimization. 
However, the reliability of test accuracy as the primary performance metric has been called into question, notably through research highlighting how label noise can obscure the true rank… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14401v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14401v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14401v1-abstract-full" style="display: none;"> In the AutoML domain, test accuracy is heralded as the quintessential metric for evaluating model efficacy, underpinning a wide array of applications from neural architecture search to hyperparameter optimization. However, the reliability of test accuracy as the primary performance metric has been called into question, notably through research highlighting how label noise can obscure the true ranking of state-of-the-art models. We venture beyond, along another perspective where the existence of hard samples within datasets casts further doubt on the generalization capabilities inferred from test accuracy alone. Our investigation reveals that the distribution of hard samples between training and test sets affects the difficulty levels of those sets, thereby influencing the perceived generalization capability of models. We unveil two distinct generalization pathways, toward easy and hard samples, highlighting the complexity of achieving balanced model evaluation. Finally, we propose a benchmarking procedure for comparing hard sample identification methods, facilitating the advancement of more nuanced approaches in this area. Our primary goal is not to propose a definitive solution but to highlight the limitations of relying primarily on test accuracy as an evaluation metric, even when working with balanced datasets, by introducing the in-class data imbalance problem. By doing so, we aim to stimulate a critical discussion within the research community and open new avenues for research that consider a broader spectrum of model evaluation criteria. The anonymous code is available at https://github.com/PawPuk/CurvBIM under the GPL-3.0 license. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14401v1-abstract-full').style.display = 'none'; document.getElementById('2409.14401v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
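<p class="is-size-7">One way to see the in-class imbalance argument concretely is to report accuracy separately on easy and hard test samples once some hard-sample indicator is available; the sketch below uses a placeholder indicator and toy labels rather than any of the identification methods the paper benchmarks.</p> <pre><code class="language-python">
# Illustration only: given a hard-sample indicator, report accuracy
# separately on easy and hard test samples instead of one aggregate number.
import numpy as np

def split_accuracy(y_true, y_pred, is_hard):
    """All arguments are 1-D sequences of equal length; is_hard is boolean."""
    y_true, y_pred, is_hard = map(np.asarray, (y_true, y_pred, is_hard))
    correct = (y_true == y_pred)
    return {"overall": correct.mean(),
            "easy": correct[~is_hard].mean(),
            "hard": correct[is_hard].mean()}

print(split_accuracy([0, 1, 1, 0], [0, 1, 0, 1], [False, False, True, True]))
</code></pre>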
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to workshop track of AutoML'24 (see openreview)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14015">arXiv:2409.14015</a> <span> [<a href="https://arxiv.org/pdf/2409.14015">pdf</a>, <a href="https://arxiv.org/ps/2409.14015">ps</a>, <a href="https://arxiv.org/format/2409.14015">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> The vertex-pancyclicity of the simplified shuffle-cube and the vertex-bipancyclicity of the balanced shuffle-cube </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yasong Liu</a>, <a href="/search/cs?searchtype=author&query=L%C3%BC%2C+H">Huazhong Lü</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14015v1-abstract-short" style="display: inline;"> A graph $G$ $=$ $(V,E)$ is vertex-pancyclic if for every vertex $u$ and any integer $l$ ranging from $3$ to $|V|$, $G$ contains a cycle $C$ of length $l$ such that $u$ is on $C$. A bipartite graph $G$ $=$ $(V,E)$ is vertex-bipancyclic if for every vertex $u$ and any even integer $l$ ranging from $4$ to $|V|$, $G$ contains a cycle $C$ of length $l$ such that $u$ is on $C$. The simplified shuffle-cu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14015v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14015v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14015v1-abstract-full" style="display: none;"> A graph $G$ $=$ $(V,E)$ is vertex-pancyclic if for every vertex $u$ and any integer $l$ ranging from $3$ to $|V|$, $G$ contains a cycle $C$ of length $l$ such that $u$ is on $C$. A bipartite graph $G$ $=$ $(V,E)$ is vertex-bipancyclic if for every vertex $u$ and any even integer $l$ ranging from $4$ to $|V|$, $G$ contains a cycle $C$ of length $l$ such that $u$ is on $C$. The simplified shuffle-cube and the balanced shuffle-cube are two variants of the shuffle-cube that are superior to the shuffle-cube in terms of vertex-transitivity. In this paper, we show that the $n$-dimensional simplified shuffle-cube is vertex-pancyclic for $n\geqslant 6$, and the $n$-dimensional balanced shuffle-cube is vertex-bipancyclic for $n\geqslant 2$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14015v1-abstract-full').style.display = 'none'; document.getElementById('2409.14015v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
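<p class="is-size-7">For small graphs the vertex-pancyclicity definition quoted above can be checked directly by brute force, which may help readers unfamiliar with the terminology; the paper itself proves the property for shuffle-cube variants rather than enumerating cycles, so the snippet below is purely illustrative.</p> <pre><code class="language-python">
# Brute-force check of the definition for small graphs (illustrative only).
# A graph is vertex-pancyclic if every vertex lies on a cycle of every
# length from 3 to |V|.
def has_cycle_through(adj, u, length):
    """adj: dict mapping each vertex to a set of neighbours. DFS for a
    simple cycle of exactly `length` edges that starts and ends at u."""
    def dfs(v, remaining, visited):
        if remaining == 0:
            return v == u
        for w in adj[v]:
            if w == u and remaining == 1:
                return True
            if w not in visited and w != u:
                if dfs(w, remaining - 1, visited | {w}):
                    return True
        return False
    return dfs(u, length, {u})

def is_vertex_pancyclic(adj):
    n = len(adj)
    return all(has_cycle_through(adj, u, l)
               for u in adj for l in range(3, n + 1))

K4 = {0: {1, 2, 3}, 1: {0, 2, 3}, 2: {0, 1, 3}, 3: {0, 1, 2}}
print(is_vertex_pancyclic(K4))  # True: K4 has 3- and 4-cycles through each vertex
</code></pre>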
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13477">arXiv:2409.13477</a> <span> [<a href="https://arxiv.org/pdf/2409.13477">pdf</a>, <a href="https://arxiv.org/format/2409.13477">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> A Plug-and-Play Method for Guided Multi-contrast MRI Reconstruction based on Content/Style Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rao%2C+C">Chinmay Rao</a>, <a href="/search/cs?searchtype=author&query=van+Osch%2C+M">Matthias van Osch</a>, <a href="/search/cs?searchtype=author&query=Pezzotti%2C+N">Nicola Pezzotti</a>, <a href="/search/cs?searchtype=author&query=de+Bresser%2C+J">Jeroen de Bresser</a>, <a href="/search/cs?searchtype=author&query=Beljaards%2C+L">Laurens Beljaards</a>, <a href="/search/cs?searchtype=author&query=Meineke%2C+J">Jakob Meineke</a>, <a href="/search/cs?searchtype=author&query=de+Weerdt%2C+E">Elwin de Weerdt</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huangling Lu</a>, <a href="/search/cs?searchtype=author&query=Doneva%2C+M">Mariya Doneva</a>, <a href="/search/cs?searchtype=author&query=Staring%2C+M">Marius Staring</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13477v1-abstract-short" style="display: inline;"> Since multiple MRI contrasts of the same anatomy contain redundant information, one contrast can be used as a prior for guiding the reconstruction of an undersampled subsequent contrast. To this end, several learning-based guided reconstruction methods have been proposed. However, two key challenges remain - (a) the requirement of large paired training datasets and (b) the lack of intuitive unders… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13477v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13477v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13477v1-abstract-full" style="display: none;"> Since multiple MRI contrasts of the same anatomy contain redundant information, one contrast can be used as a prior for guiding the reconstruction of an undersampled subsequent contrast. To this end, several learning-based guided reconstruction methods have been proposed. However, two key challenges remain - (a) the requirement of large paired training datasets and (b) the lack of intuitive understanding of the model's internal representation and utilization of the shared information. We propose a modular two-stage approach for guided reconstruction, addressing these challenges. A content/style model of two-contrast image data is learned in a largely unpaired manner and is subsequently applied as a plug-and-play operator in iterative reconstruction. The disentanglement of content and style allows explicit representation of contrast-independent and contrast-specific factors. 
Based on this, incorporating prior information into the reconstruction reduces to simply replacing the aliased reconstruction content with clean content derived from the reference scan. We name this novel approach PnP-MUNIT. Various aspects like interpretability and convergence are explored via simulations. Furthermore, its practicality is demonstrated on the NYU fastMRI DICOM dataset and two in-house raw datasets, obtaining up to 32.6% more acceleration over learning-based non-guided reconstruction for a given SSIM. In a radiological task, PnP-MUNIT allowed 33.3% more acceleration over clinical reconstruction at diagnostic quality.</p> <p class="is-size-7">Submitted 20 September, 2024; originally announced September 2024.</p> <p class="comments">Comments: This work has been submitted to the IEEE for possible publication</p> <p class="comments">ACM Class: I.4.5</p> </li> <li class="arxiv-result"> <p class="list-title"><a href="https://arxiv.org/abs/2409.11650">arXiv:2409.11650</a> [<a href="https://arxiv.org/pdf/2409.11650">pdf</a>, <a href="https://arxiv.org/format/2409.11650">other</a>]</p> <div class="tags"><span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span></div> <p class="title">Art and Science of Quantizing Large-Scale Models: A Comprehensive Overview</p> <p class="authors">Authors: <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanshu Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tong Yang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiyan Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hanning Lu</a>, <a href="/search/cs?searchtype=author&query=Zhe%2C+X">Xu Zhe</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yaoming Li</a>, <a href="/search/cs?searchtype=author&query=Weitao%2C+L">Li Weitao</a></p> <p class="abstract">Abstract:
This paper provides a comprehensive overview of the principles, challenges, and methodologies associated with quantizing large-scale neural network models. As neural networks have evolved towards larger and more complex architectures to address increasingly sophisticated tasks, the computational and energy costs have escalated significantly. We explore the necessity and impact of model size growth, highlighting the performance benefits as well as the computational challenges and environmental considerations. The core focus is on model quantization as a fundamental approach to mitigate these challenges by reducing model size and improving efficiency without substantially compromising accuracy. We delve into various quantization techniques, including both post-training quantization (PTQ) and quantization-aware training (QAT), and analyze several state-of-the-art algorithms such as LLM-QAT, PEQA(L4Q), ZeroQuant, SmoothQuant, and others. Through comparative analysis, we examine how these methods address issues like outliers, importance weighting, and activation quantization, ultimately contributing to more sustainable and accessible deployment of large-scale models.</p> <p class="is-size-7">Submitted 17 September, 2024; originally announced September 2024.</p>
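<p class="is-size-7">As a minimal illustration of the post-training quantization (PTQ) idea surveyed above, the sketch below performs symmetric per-tensor int8 quantization of a weight matrix with NumPy; it is a generic textbook example, not one of the surveyed algorithms.</p>
<pre><code class="language-python">
import numpy as np

def quantize_int8(w):
    """Symmetric per-tensor post-training quantization of a weight array to int8.
    Returns the quantized integers and the scale needed to dequantize."""
    scale = np.abs(w).max() / 127.0
    q = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
    return q, scale

rng = np.random.default_rng(0)
w = rng.normal(size=(256, 256)).astype(np.float32)   # toy weight matrix
q, scale = quantize_int8(w)
w_hat = q.astype(np.float32) * scale                  # dequantized approximation
print("mean abs quantization error:", np.abs(w - w_hat).mean())
</code></pre>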
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10063">arXiv:2409.10063</a> <span> [<a href="https://arxiv.org/pdf/2409.10063">pdf</a>, <a href="https://arxiv.org/format/2409.10063">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> GlobalMapNet: An Online Framework for Vectorized Global HD Map Construction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+A">Anqi Shi</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuze Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+J">Jian Pu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zeyu Fu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hong Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10063v2-abstract-short" style="display: inline;"> High-definition (HD) maps are essential for autonomous driving systems. Traditionally, an expensive and labor-intensive pipeline is implemented to construct HD maps, which is limited in scalability. In recent years, crowdsourcing and online mapping have emerged as two alternative methods, but they have limitations respectively. In this paper, we provide a novel methodology, namely global map const… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10063v2-abstract-full').style.display = 'inline'; document.getElementById('2409.10063v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10063v2-abstract-full" style="display: none;"> High-definition (HD) maps are essential for autonomous driving systems. Traditionally, an expensive and labor-intensive pipeline is implemented to construct HD maps, which is limited in scalability. In recent years, crowdsourcing and online mapping have emerged as two alternative methods, but they have limitations respectively. In this paper, we provide a novel methodology, namely global map construction, to perform direct generation of vectorized global maps, combining the benefits of crowdsourcing and online mapping. We introduce GlobalMapNet, the first online framework for vectorized global HD map construction, which updates and utilizes a global map on the ego vehicle. To generate the global map from scratch, we propose GlobalMapBuilder to match and merge local maps continuously. We design a new algorithm, Map NMS, to remove duplicate map elements and produce a clean map. We also propose GlobalMapFusion to aggregate historical map information, improving consistency of prediction. We examine GlobalMapNet on two widely recognized datasets, Argoverse2 and nuScenes, showing that our framework is capable of generating globally consistent results. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10063v2-abstract-full').style.display = 'none'; document.getElementById('2409.10063v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07829">arXiv:2409.07829</a> <span> [<a href="https://arxiv.org/pdf/2409.07829">pdf</a>, <a href="https://arxiv.org/format/2409.07829">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Enabling Cost-Effective UI Automation Testing with Retrieval-Based LLMs: A Case Study in WeChat </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+S">Sidong Feng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haochuan Lu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jianqin Jiang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+T">Ting Xiong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Likun Huang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yinglin Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoqin Li</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yuetang Deng</a>, <a href="/search/cs?searchtype=author&query=Aleti%2C+A">Aldeida Aleti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07829v1-abstract-short" style="display: inline;"> UI automation tests play a crucial role in ensuring the quality of mobile applications. Despite the growing popularity of machine learning techniques to generate these tests, they still face several challenges, such as the mismatch of UI elements. The recent advances in Large Language Models (LLMs) have addressed these issues by leveraging their semantic understanding capabilities. However, a sign… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07829v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07829v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07829v1-abstract-full" style="display: none;"> UI automation tests play a crucial role in ensuring the quality of mobile applications. Despite the growing popularity of machine learning techniques to generate these tests, they still face several challenges, such as the mismatch of UI elements. The recent advances in Large Language Models (LLMs) have addressed these issues by leveraging their semantic understanding capabilities. However, a significant gap remains in applying these models to industrial-level app testing, particularly in terms of cost optimization and knowledge limitation. To address this, we introduce CAT to create cost-effective UI automation tests for industry apps by combining machine learning and LLMs with best practices. 
Given the task description, CAT employs Retrieval Augmented Generation (RAG) to source examples of industrial app usage as the few-shot learning context, assisting LLMs in generating the specific sequence of actions. CAT then employs machine learning techniques, with LLMs serving as a complementary optimizer, to map the target element on the UI screen. Our evaluations on the WeChat testing dataset demonstrate CAT's performance and cost-effectiveness, achieving 90% UI automation with $0.34 cost, outperforming the state-of-the-art. We have also integrated our approach into the real-world WeChat testing platform, demonstrating its usefulness in detecting 141 bugs and enhancing the developers' testing process.</p> <p class="is-size-7">Submitted 12 September, 2024; originally announced September 2024.</p> </li> <li class="arxiv-result"> <p class="list-title"><a href="https://arxiv.org/abs/2409.06305">arXiv:2409.06305</a> [<a href="https://arxiv.org/pdf/2409.06305">pdf</a>, <a href="https://arxiv.org/format/2409.06305">other</a>]</p> <div class="tags"><span class="tag">cs.CV (Computer Vision and Pattern Recognition)</span></div> <p class="title">High-Performance Few-Shot Segmentation with Foundation Models: An Empirical Study</p> <p class="authors">Authors: <a href="/search/cs?searchtype=author&query=Chang%2C+S">Shijie Chang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lihe Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a></p> <p class="abstract">Abstract:
Existing few-shot segmentation (FSS) methods mainly focus on designing novel support-query matching and self-matching mechanisms to exploit implicit knowledge in pre-trained backbones. However, the performance of these methods is often constrained by models pre-trained on classification tasks. The exploration of what types of pre-trained models can provide more beneficial implicit knowledge for FSS remains limited. In this paper, inspired by the representation consistency of foundational computer vision models, we develop an FSS framework based on foundation models. To be specific, we propose a simple approach to extract implicit knowledge from foundation models to construct coarse correspondence and introduce a lightweight decoder to refine coarse correspondence for fine-grained segmentation. We systematically summarize the performance of various foundation models on FSS and discover that the implicit knowledge within some of these models is more beneficial for FSS than models pre-trained on classification tasks. Extensive experiments on two widely used datasets demonstrate the effectiveness of our approach in leveraging the implicit knowledge of foundation models. Notably, the combination of DINOv2 and DFN exceeds previous state-of-the-art methods by 17.5% on COCO-20i. Code is available at https://github.com/DUT-CSJ/FoundationFSS.</p> <p class="is-size-7">Submitted 10 September, 2024; originally announced September 2024.</p> <p class="comments">Comments: under review</p> </li> <li class="arxiv-result"> <p class="list-title"><a href="https://arxiv.org/abs/2409.05921">arXiv:2409.05921</a> [<a href="https://arxiv.org/pdf/2409.05921">pdf</a>, <a href="https://arxiv.org/format/2409.05921">other</a>]</p> <div class="tags"><span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span></div> <p class="title">STLLM-DF: A Spatial-Temporal Large Language Model with Diffusion for Enhanced Multi-Mode Traffic System Forecasting</p> <p class="authors">Authors: <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiqi Shao</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+H">Haoning Xi</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haohui Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ze Wang</a>, <a href="/search/cs?searchtype=author&query=Bell%2C+M+G+H">Michael G. H. Bell</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junbin Gao</a></p> <p class="abstract">Abstract:
The rapid advancement of Intelligent Transportation Systems (ITS) presents challenges, particularly with missing data in multi-modal transportation and the complexity of handling diverse sequential tasks within a centralized framework. To address these issues, we propose the Spatial-Temporal Large Language Model Diffusion (STLLM-DF), an innovative model that leverages Denoising Diffusion Probabilistic Models (DDPMs) and Large Language Models (LLMs) to improve multi-task transportation prediction. The DDPM's robust denoising capabilities enable it to recover underlying data patterns from noisy inputs, making it particularly effective in complex transportation systems. Meanwhile, the non-pretrained LLM dynamically adapts to spatial-temporal relationships within multi-modal networks, allowing the system to efficiently manage diverse transportation tasks in both long-term and short-term predictions. Extensive experiments demonstrate that STLLM-DF consistently outperforms existing models, achieving an average reduction of 2.40% in MAE, 4.50% in RMSE, and 1.51% in MAPE. This model significantly advances centralized ITS by enhancing predictive accuracy, robustness, and overall system performance across multiple tasks, thus paving the way for more effective spatio-temporal traffic forecasting through the integration of frozen transformer language models and diffusion techniques.</p> <p class="is-size-7">Submitted 8 September, 2024; originally announced September 2024.</p>
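<p class="is-size-7">For readers unfamiliar with DDPMs, the sketch below shows the standard forward-noising step and the usual reconstruction of a clean sample from a predicted noise, on a toy 1-D signal. The linear beta schedule and the oracle noise estimate are assumptions made for illustration; this is not the STLLM-DF model.</p>
<pre><code class="language-python">
import numpy as np

# Standard DDPM quantities: a linear beta schedule and its cumulative products.
T = 1000
betas = np.linspace(1e-4, 0.02, T)
alphas_bar = np.cumprod(1.0 - betas)

def forward_noise(x0, t, rng):
    """q(x_t | x_0): noise a clean sample x0 to diffusion step t."""
    eps = rng.standard_normal(x0.shape)
    xt = np.sqrt(alphas_bar[t]) * x0 + np.sqrt(1.0 - alphas_bar[t]) * eps
    return xt, eps

def reconstruct_x0(xt, eps_hat, t):
    """Invert the forward step given a noise estimate eps_hat (here: the true noise)."""
    return (xt - np.sqrt(1.0 - alphas_bar[t]) * eps_hat) / np.sqrt(alphas_bar[t])

rng = np.random.default_rng(0)
x0 = np.sin(np.linspace(0.0, 6.28, 64))          # a toy "traffic" signal
xt, eps = forward_noise(x0, t=500, rng=rng)
print(np.allclose(reconstruct_x0(xt, eps, t=500), x0))  # True with the oracle noise
</code></pre>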
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 11 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> I.2.7 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05633">arXiv:2409.05633</a> <span> [<a href="https://arxiv.org/pdf/2409.05633">pdf</a>, <a href="https://arxiv.org/format/2409.05633">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Graph Contrastive Learning with Reliable and Informative Augmentation for Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bowen Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hongyu Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Ming Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W+X">Wayne Xin Zhao</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05633v1-abstract-short" style="display: inline;"> Graph neural network (GNN) has been a powerful approach in collaborative filtering (CF) due to its ability to model high-order user-item relationships. Recently, to alleviate the data sparsity and enhance representation learning, many efforts have been conducted to integrate contrastive learning (CL) with GNNs. Despite the promising improvements, the contrastive view generation based on structure… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05633v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05633v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05633v1-abstract-full" style="display: none;"> Graph neural network (GNN) has been a powerful approach in collaborative filtering (CF) due to its ability to model high-order user-item relationships. Recently, to alleviate the data sparsity and enhance representation learning, many efforts have been conducted to integrate contrastive learning (CL) with GNNs. Despite the promising improvements, the contrastive view generation based on structure and representation perturbations in existing methods potentially disrupts the collaborative information in contrastive views, resulting in limited effectiveness of positive alignment. To overcome this issue, we propose CoGCL, a novel framework that aims to enhance graph contrastive learning by constructing contrastive views with stronger collaborative information via discrete codes. The core idea is to map users and items into discrete codes rich in collaborative information for reliable and informative contrastive view generation. 
To this end, we initially introduce a multi-level vector quantizer in an end-to-end manner to quantize user and item representations into discrete codes. Based on these discrete codes, we enhance the collaborative information of contrastive views by considering neighborhood structure and semantic relevance respectively. For neighborhood structure, we propose virtual neighbor augmentation by treating discrete codes as virtual neighbors, which expands an observed user-item interaction into multiple edges involving discrete codes. Regarding semantic relevance, we identify similar users/items based on shared discrete codes and interaction targets to generate the semantically relevant view. Through these strategies, we construct contrastive views with stronger collaborative information and develop a triple-view graph contrastive learning approach. Extensive experiments on four public datasets demonstrate the effectiveness of our proposed approach.</p> <p class="is-size-7">Submitted 9 September, 2024; originally announced September 2024.</p>
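<p class="is-size-7">"Quantizing a representation into a discrete code" can be illustrated with a generic single-level, nearest-neighbour vector quantizer, as sketched below with a randomly initialized codebook. This is only meant to show what mapping an embedding to a discrete code means; it is not CoGCL's multi-level, end-to-end quantizer.</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)
codebook = rng.normal(size=(32, 16))     # 32 code vectors of dimension 16 (toy values)

def to_discrete_code(embedding, codebook):
    """Nearest-neighbour vector quantization: index of the closest code vector."""
    return int(np.argmin(np.linalg.norm(codebook - embedding, axis=1)))

user_embedding = rng.normal(size=16)     # stand-in for a GNN-produced user representation
code = to_discrete_code(user_embedding, codebook)
print("discrete code:", code, "code vector shape:", codebook[code].shape)
</code></pre>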
To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00591v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00591v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00591v1-abstract-full" style="display: none;"> Recently, CNN and Transformer hybrid networks demonstrated excellent performance in face super-resolution (FSR) tasks. Since numerous features at different scales in hybrid networks, how to fuse these multi-scale features and promote their complementarity is crucial for enhancing FSR. However, existing hybrid network-based FSR methods ignore this, only simply combining the Transformer and CNN. To address this issue, we propose an attention-guided Multi-scale interaction network (AMINet), which contains local and global feature interactions as well as encoder-decoder phases feature interactions. Specifically, we propose a Local and Global Feature Interaction Module (LGFI) to promote fusions of global features and different receptive fields' local features extracted by our Residual Depth Feature Extraction Module (RDFE). Additionally, we propose a Selective Kernel Attention Fusion Module (SKAF) to adaptively select fusions of different features within LGFI and encoder-decoder phases. Our above design allows the free flow of multi-scale features from within modules and between encoder and decoder, which can promote the complementarity of different scale features to enhance FSR. Comprehensive experiments confirm that our method consistently performs well with less computational consumption and faster inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00591v1-abstract-full').style.display = 'none'; document.getElementById('2409.00591v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00206">arXiv:2409.00206</a> <span> [<a href="https://arxiv.org/pdf/2409.00206">pdf</a>, <a href="https://arxiv.org/format/2409.00206">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RING#: PR-by-PE Global Localization with Roto-translation Equivariant Gram Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+S">Sha Lu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuecheng Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haojian Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xieyuanli Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+R">Rong Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00206v2-abstract-short" style="display: inline;"> Global localization using onboard perception sensors, such as cameras and LiDARs, is crucial in autonomous driving and robotics applications when GPS signals are unreliable. Most approaches achieve global localization by sequential place recognition (PR) and pose estimation (PE). Some methods train separate models for each task, while others employ a single model with dual heads, trained jointly w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00206v2-abstract-full').style.display = 'inline'; document.getElementById('2409.00206v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00206v2-abstract-full" style="display: none;"> Global localization using onboard perception sensors, such as cameras and LiDARs, is crucial in autonomous driving and robotics applications when GPS signals are unreliable. Most approaches achieve global localization by sequential place recognition (PR) and pose estimation (PE). Some methods train separate models for each task, while others employ a single model with dual heads, trained jointly with separate task-specific losses. However, the accuracy of localization heavily depends on the success of place recognition, which often fails in scenarios with significant changes in viewpoint or environmental appearance. Consequently, this renders the final pose estimation of localization ineffective. To address this, we introduce a new paradigm, PR-by-PE localization, which bypasses the need for separate place recognition by directly deriving it from pose estimation. We propose RING#, an end-to-end PR-by-PE localization network that operates in the bird's-eye-view (BEV) space, compatible with both vision and LiDAR sensors. 
RING# incorporates a novel design that learns two equivariant representations from BEV features, enabling globally convergent and computationally efficient pose estimation. Comprehensive experiments on the NCLT and Oxford datasets show that RING# outperforms state-of-the-art methods in both vision and LiDAR modalities, validating the effectiveness of the proposed approach. The code will be publicly released.</p> <p class="is-size-7">Submitted 17 September, 2024; v1 submitted 30 August, 2024; originally announced September 2024.</p> <p class="comments">Comments: 25 pages, 21 figures</p> </li> <li class="arxiv-result"> <p class="list-title"><a href="https://arxiv.org/abs/2408.15663">arXiv:2408.15663</a> [<a href="https://arxiv.org/pdf/2408.15663">pdf</a>, <a href="https://arxiv.org/format/2408.15663">other</a>]</p> <div class="tags"><span class="tag">cs.RO (Robotics)</span></div> <p class="title">NeuroVE: Brain-inspired Linear-Angular Velocity Estimation with Spiking Neural Networks</p> <p class="authors">Authors: <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xieyuanli Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+R">Ruibin Guo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yujie Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zongtan Zhou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fangwen Yu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huimin Lu</a></p> <p class="abstract">Abstract: Vision-based ego-velocity estimation is a fundamental problem in robot state estimation. However, the constraints of frame-based cameras, including motion blur and insufficient frame rates in dynamic settings, readily lead to the failure of conventional velocity estimation techniques.
Mammals exhibit a remarkable ability to accurately estimate their ego-velocity during aggressive movement. Hence, integrating this capability into robots shows great promise for addressing these challenges. In this paper, we propose a brain-inspired framework for linear-angular velocity estimation, dubbed NeuroVE. The NeuroVE framework employs an event camera to capture the motion information and implements spiking neural networks (SNNs) to simulate the brain's spatial cells' function for velocity estimation. We formulate the velocity estimation as a time-series forecasting problem. To this end, we design an Astrocyte Leaky Integrate-and-Fire (ALIF) neuron model to encode continuous values. Additionally, we have developed an Astrocyte Spiking Long Short-term Memory (ASLSTM) structure, which significantly improves the time-series forecasting capabilities, enabling an accurate estimate of ego-velocity. Results from both simulation and real-world experiments indicate that NeuroVE has achieved an approximate 60% increase in accuracy compared to other SNN-based approaches.</p> <p class="is-size-7">Submitted 28 August, 2024; originally announced August 2024.</p> </li> <li class="arxiv-result"> <p class="list-title"><a href="https://arxiv.org/abs/2408.15609">arXiv:2408.15609</a> [<a href="https://arxiv.org/pdf/2408.15609">pdf</a>, <a href="https://arxiv.org/format/2408.15609">other</a>]</p> <div class="tags"><span class="tag">cs.NI (Networking and Internet Architecture)</span> <span class="tag">cs.LG (Machine Learning)</span></div> <p class="title">Statistical QoS Provision in Business-Centric Networks</p> <p class="authors">Authors: <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chang Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuang Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hancheng Lu</a></p> <p class="abstract">Abstract:
More refined resource management and Quality of Service (QoS) provisioning is a critical goal of wireless communication technologies. In this paper, we propose a novel Business-Centric Network (BCN) aimed at enabling scalable QoS provisioning, based on a cross-layer framework that captures the relationship between application, transport parameters, and channels. We investigate both continuous flow and event-driven flow models, presenting key QoS metrics such as throughput, delay, and reliability. By jointly considering power and bandwidth allocation, transmission parameters, and AP network topology across layers, we optimize weighted resource efficiency with statistical QoS provisioning. To address the coupling among parameters, we propose a novel deep reinforcement learning (DRL) framework, which is Collaborative Optimization among Heterogeneous Actors with Experience Sharing (COHA-ES). Power and sub-channel (SC) Actors representing multiple APs are jointly optimized under the unified guidance of a common critic. Additionally, we introduce a novel multithreaded experience-sharing mechanism to accelerate training and enhance rewards. Extensive comparative experiments validate the effectiveness of our DRL framework in terms of convergence and efficiency. Moreover, comparative analyses demonstrate the comprehensive advantages of the BCN structure in enhancing both spectral and energy efficiency.</p> <p class="is-size-7">Submitted 28 August, 2024; originally announced August 2024.</p>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14806">arXiv:2408.14806</a> <span> [<a href="https://arxiv.org/pdf/2408.14806">pdf</a>, <a href="https://arxiv.org/format/2408.14806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Poly2Vec: Polymorphic Encoding of Geospatial Objects for Spatial Reasoning with Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Siampou%2C+M+D">Maria Despoina Siampou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialiang Li</a>, <a href="/search/cs?searchtype=author&query=Krumm%2C+J">John Krumm</a>, <a href="/search/cs?searchtype=author&query=Shahabi%2C+C">Cyrus Shahabi</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hua Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14806v1-abstract-short" style="display: inline;"> Encoding geospatial data is crucial for enabling machine learning (ML) models to perform tasks that require spatial reasoning, such as identifying the topological relationships between two different geospatial objects. However, existing encoding methods are limited as they are typically customized to handle only specific types of spatial data, which impedes their applicability across different dow… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14806v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14806v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14806v1-abstract-full" style="display: none;"> Encoding geospatial data is crucial for enabling machine learning (ML) models to perform tasks that require spatial reasoning, such as identifying the topological relationships between two different geospatial objects. However, existing encoding methods are limited as they are typically customized to handle only specific types of spatial data, which impedes their applicability across different downstream tasks where multiple data types coexist. To address this, we introduce Poly2Vec, an encoding framework that unifies the modeling of different geospatial objects, including 2D points, polylines, and polygons, irrespective of the downstream task. We leverage the power of the 2D Fourier transform to encode useful spatial properties, such as shape and location, from geospatial objects into fixed-length vectors. These vectors are then inputted into neural network models for spatial reasoning tasks.This unified approach eliminates the need to develop and train separate models for each distinct spatial type. We evaluate Poly2Vec on both synthetic and real datasets of mixed geometry types and verify its consistent performance across several downstream spatial reasoning tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14806v1-abstract-full').style.display = 'none'; document.getElementById('2408.14806v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lu%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lu%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>