
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 104 results for author: <span class="mathjax">Lan, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Lan%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Lan, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lan%2C+X&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lan, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Lan%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Lan%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Lan%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Lan%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00833">arXiv:2412.00833</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00833">pdf</a>, <a href="https://arxiv.org/format/2412.00833">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AlignMamba: Enhancing Multimodal Mamba with Local and Global Cross-modal Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Y">Yifei Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haifeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00833v1-abstract-short" style="display: inline;"> Cross-modal alignment is crucial for multimodal representation fusion due to the inherent heterogeneity between modalities. While Transformer-based methods have shown promising results in modeling inter-modal relationships, their quadratic computational complexity limits their applicability to long-sequence or large-scale data. 
Although recent Mamba-based approaches achieve linear complexity, thei&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00833v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00833v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00833v1-abstract-full" style="display: none;"> Cross-modal alignment is crucial for multimodal representation fusion due to the inherent heterogeneity between modalities. While Transformer-based methods have shown promising results in modeling inter-modal relationships, their quadratic computational complexity limits their applicability to long-sequence or large-scale data. Although recent Mamba-based approaches achieve linear complexity, their sequential scanning mechanism poses fundamental challenges in comprehensively modeling cross-modal relationships. To address this limitation, we propose AlignMamba, an efficient and effective method for multimodal fusion. Specifically, grounded in Optimal Transport, we introduce a local cross-modal alignment module that explicitly learns token-level correspondences between different modalities. Moreover, we propose a global cross-modal alignment loss based on Maximum Mean Discrepancy to implicitly enforce the consistency between different modal distributions. Finally, the unimodal representations after local and global alignment are passed to the Mamba backbone for further cross-modal interaction and multimodal fusion. Extensive experiments on complete and incomplete multimodal fusion tasks demonstrate the effectiveness and efficiency of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00833v1-abstract-full').style.display = 'none'; document.getElementById('2412.00833v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
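The global cross-modal alignment loss in the AlignMamba abstract is based on Maximum Mean Discrepancy (MMD). As a rough sketch of that idea only (not the paper's code; the RBF kernel, bandwidth, and the assumption that token features are compared directly are illustrative choices), a biased MMD² estimate between two sets of modality features can be computed as:

```python
import torch

def rbf_mmd2(x: torch.Tensor, y: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
    """Biased estimate of squared Maximum Mean Discrepancy between feature sets
    x (n, d) and y (m, d) under an RBF kernel; minimizing it pulls the two
    modality distributions toward each other."""
    def k(a, b):
        return torch.exp(-torch.cdist(a, b) ** 2 / (2 * sigma ** 2))
    return k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean()

# Illustrative usage with made-up visual and textual token features.
visual_tokens = torch.randn(64, 256)
text_tokens = torch.randn(48, 256)
global_align_loss = rbf_mmd2(visual_tokens, text_tokens)
```

In training, a term like this would typically be added to the main task loss with a weighting coefficient; those details are not specified in the abstract.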
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18211">arXiv:2411.18211</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18211">pdf</a>, <a href="https://arxiv.org/format/2411.18211">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TimeMarker: A Versatile Video-LLM for Long and Short Video Understanding with Superior Temporal Localization Ability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shimin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaohan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yitian Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Jie%2C+Z">Zequn Jie</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Lin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18211v1-abstract-short" style="display: inline;"> Rapid development of large language models (LLMs) has significantly advanced multimodal large language models (LMMs), particularly in vision-language tasks. However, existing video-language models often overlook precise temporal localization and struggle with videos of varying lengths. We introduce TimeMarker, a versatile Video-LLM designed for high-quality dialogue based on video content, emphasi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18211v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18211v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18211v1-abstract-full" style="display: none;"> Rapid development of large language models (LLMs) has significantly advanced multimodal large language models (LMMs), particularly in vision-language tasks. However, existing video-language models often overlook precise temporal localization and struggle with videos of varying lengths. We introduce TimeMarker, a versatile Video-LLM designed for high-quality dialogue based on video content, emphasizing temporal localization. TimeMarker integrates Temporal Separator Tokens to enhance temporal awareness, accurately marking specific moments within videos. It employs the AnyLength mechanism for dynamic frame sampling and adaptive token merging, enabling effective handling of both short and long videos. Additionally, TimeMarker utilizes diverse datasets, including further transformed temporal-related video QA datasets, to bolster its temporal understanding capabilities. Image and interleaved data are also employed to further enhance the model&#39;s semantic perception ability. Evaluations demonstrate that TimeMarker achieves state-of-the-art performance across multiple benchmarks, excelling in both short and long video categories. Our project page is at \url{https://github.com/TimeMarker-LLM/TimeMarker/}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18211v1-abstract-full').style.display = 'none'; document.getElementById('2411.18211v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17274">arXiv:2411.17274</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17274">pdf</a>, <a href="https://arxiv.org/format/2411.17274">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> CleanVul: Automatic Function-Level Vulnerability Detection in Code Commits Using LLM Heuristics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yikun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Ting Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Widyasari%2C+R">Ratnadira Widyasari</a>, <a href="/search/cs?searchtype=author&amp;query=Tun%2C+Y+N">Yan Naing Tun</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+H+H">Huu Hung Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Bui%2C+T">Tan Bui</a>, <a href="/search/cs?searchtype=author&amp;query=Irsan%2C+I+C">Ivana Clairine Irsan</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yiran Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Ang%2C+H+W">Han Wei Ang</a>, <a href="/search/cs?searchtype=author&amp;query=Liauw%2C+F">Frank Liauw</a>, <a href="/search/cs?searchtype=author&amp;query=Weyssow%2C+M">Martin Weyssow</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+H+J">Hong Jin Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Ouh%2C+E+L">Eng Lieh Ouh</a>, <a href="/search/cs?searchtype=author&amp;query=Shar%2C+L+K">Lwin Khin Shar</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+D">David Lo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17274v2-abstract-short" style="display: inline;"> Accurate identification of software vulnerabilities is crucial for system integrity. Vulnerability datasets, often derived from the National Vulnerability Database (NVD) or directly from GitHub, are essential for training machine learning models to detect these security flaws. 
3. arXiv:2411.17274 [pdf, other]  cs.SE, cs.CR
   CleanVul: Automatic Function-Level Vulnerability Detection in Code Commits Using LLM Heuristics
   Authors: Yikun Li, Ting Zhang, Ratnadira Widyasari, Yan Naing Tun, Huu Hung Nguyen, Tan Bui, Ivana Clairine Irsan, Yiran Cheng, Xiang Lan, Han Wei Ang, Frank Liauw, Martin Weyssow, Hong Jin Kang, Eng Lieh Ouh, Lwin Khin Shar, David Lo
   Abstract: Accurate identification of software vulnerabilities is crucial for system integrity. Vulnerability datasets, often derived from the National Vulnerability Database (NVD) or directly from GitHub, are essential for training machine learning models to detect these security flaws. However, these datasets frequently suffer from significant noise, typically 40% to 75%, due primarily to the automatic and indiscriminate labeling of all changes in vulnerability-fixing commits (VFCs) as vulnerability-related. This misclassification occurs because not all changes in a commit aimed at fixing vulnerabilities pertain to security threats; many are routine updates like bug fixes or test improvements. This paper introduces the first methodology that uses the Large Language Model (LLM) with a heuristic enhancement to automatically identify vulnerability-fixing changes from VFCs, achieving an F1-score of 0.82. VulSifter was applied to a large-scale study, where we conducted a crawl of 127,063 repositories on GitHub, resulting in the acquisition of 5,352,105 commits. VulSifter involves utilizing an LLM to comprehend code semantics and contextual information, while applying heuristics to filter out unrelated changes. We then developed CleanVul, a high-quality dataset comprising 11,632 functions using our LLM heuristic enhancement approach, demonstrating Correctness (90.6%) comparable to established datasets such as SVEN and PrimeVul. To evaluate the CleanVul dataset, we conducted experiments focusing on fine-tuning various LLMs on CleanVul and other high-quality datasets. Evaluation results reveal that LLMs fine-tuned on CleanVul not only exhibit enhanced accuracy but also superior generalization capabilities compared to those trained on uncleaned datasets. Specifically, models trained on CleanVul and tested on PrimeVul achieve accuracy higher than those trained and tested exclusively on PrimeVul.
   Submitted 27 November, 2024; v1 submitted 26 November, 2024; originally announced November 2024.
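The VulSifter step described in the CleanVul abstract combines cheap heuristics with an LLM judgement to keep only security-relevant changes from a vulnerability-fixing commit. A hedged sketch of that kind of pipeline (the file-name heuristics, the `llm_is_security_fix` callable, and the change-record shape are all illustrative assumptions, not the paper's implementation):

```python
from typing import Callable, Dict, List

def sift_vfc_changes(changes: List[Dict[str, str]],
                     llm_is_security_fix: Callable[[str], bool]) -> List[Dict[str, str]]:
    """Filter the changes of a vulnerability-fixing commit (VFC).
    Each change is assumed to carry 'path' and 'diff' fields."""
    kept = []
    for change in changes:
        path = change["path"].lower()
        # Heuristics: discard tests, documentation, and empty diffs outright.
        if "test" in path or path.endswith((".md", ".rst", ".txt")):
            continue
        if not change["diff"].strip():
            continue
        # Ask the LLM whether the remaining diff actually addresses a vulnerability.
        if llm_is_security_fix(change["diff"]):
            kept.append(change)
    return kept
```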
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17157">arXiv:2411.17157</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17157">pdf</a>, <a href="https://arxiv.org/format/2411.17157">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Impact of Anthropomorphism in Role-Playing AI Chatbots on Media Dependency: A Case Study of Xuanhe AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Q">Qiufang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xingyu Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17157v1-abstract-short" style="display: inline;"> Powered by large language models, the conversational capabilities of AI have seen significant improvements. In this context, a series of role-playing AI chatbots have emerged, exhibiting a strong tendency toward anthropomorphism, such as conversing like humans, possessing personalities, and fulfilling social and companionship functions. Informed by media dependency theory in communication studies,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17157v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17157v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17157v1-abstract-full" style="display: none;"> Powered by large language models, the conversational capabilities of AI have seen significant improvements. In this context, a series of role-playing AI chatbots have emerged, exhibiting a strong tendency toward anthropomorphism, such as conversing like humans, possessing personalities, and fulfilling social and companionship functions. Informed by media dependency theory in communication studies, this work hypothesizes that a higher level of anthropomorphism of the role-playing chatbots will increase users&#39; media dependency (i.e., people will depend on media that meets their needs and goals). Specifically, we conducted a user study on a Chinese role-playing chatbot platform, Xuanhe AI, selecting four representative chatbots as research targets. We invited 149 users to interact with these chatbots over a period. A questionnaire survey revealed a significant positive correlation between the degree of anthropomorphism in role-playing chatbots and users&#39; media dependency, with user satisfaction mediating this relationship. Next, based on the quantitative results, we conducted semi-structured interviews with ten users to further understand the factors that deterred them from depending on anthropomorphic chatbots. In conclusion, this work has provided empirical insights for the design of role-playing AI chatbots and deepened the understanding of how users engage with conversational AI over a longer period. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17157v1-abstract-full').style.display = 'none'; document.getElementById('2411.17157v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16167">arXiv:2411.16167</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16167">pdf</a>, <a href="https://arxiv.org/format/2411.16167">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BadSFL: Backdoor Attack against Scaffold Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xingshuo Han</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuanye Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haozhao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+S">Shengmin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+S">Shen Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+J">Jason Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Ming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Heinrich%2C+M">Michael Heinrich</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tianwei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16167v2-abstract-short" style="display: inline;"> Federated learning (FL) enables the training of deep learning models on distributed clients to preserve data privacy. However, this learning paradigm is vulnerable to backdoor attacks, where malicious clients can upload poisoned local models to embed backdoors into the global model, leading to attacker-desired predictions. Existing backdoor attacks mainly focus on FL with independently and identic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16167v2-abstract-full').style.display = 'inline'; document.getElementById('2411.16167v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16167v2-abstract-full" style="display: none;"> Federated learning (FL) enables the training of deep learning models on distributed clients to preserve data privacy. However, this learning paradigm is vulnerable to backdoor attacks, where malicious clients can upload poisoned local models to embed backdoors into the global model, leading to attacker-desired predictions. Existing backdoor attacks mainly focus on FL with independently and identically distributed (IID) scenarios, while real-world FL training data are typically non-IID. Current strategies for non-IID backdoor attacks suffer from limitations in maintaining effectiveness and durability. 
To address these challenges, we propose a novel backdoor attack method, BadSFL, specifically designed for the FL framework using the scaffold aggregation algorithm in non-IID settings. BadSFL leverages a Generative Adversarial Network (GAN) based on the global model to complement the training set, achieving high accuracy on both backdoor and benign samples. It utilizes a specific feature as the backdoor trigger to ensure stealthiness, and exploits the Scaffold&#39;s control variate to predict the global model&#39;s convergence direction, ensuring the backdoor&#39;s persistence. Extensive experiments on three benchmark datasets demonstrate the high effectiveness, stealthiness, and durability of BadSFL. Notably, our attack remains effective over 60 rounds in the global model and up to 3 times longer than existing baseline attacks after stopping the injection of malicious updates. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16167v2-abstract-full').style.display = 'none'; document.getElementById('2411.16167v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13183">arXiv:2411.13183</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13183">pdf</a>, <a href="https://arxiv.org/format/2411.13183">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ClickTrack: Towards Real-time Interactive Single Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kuiran Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xuehui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wenwen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guorong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Q">Qixiang Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Jiao%2C+J">Jianbin Jiao</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Z">Zhenjun Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13183v2-abstract-short" style="display: inline;"> Single object tracking(SOT) relies on precise object bounding box initialization. In this paper, we reconsidered the deficiencies in the current approaches to initializing single object trackers and propose a new paradigm for single object tracking algorithms, ClickTrack, a new paradigm using clicking interaction for real-time scenarios. 
6. arXiv:2411.13183 [pdf, other]  cs.CV
   ClickTrack: Towards Real-time Interactive Single Object Tracking
   Authors: Kuiran Wang, Xuehui Yu, Wenwen Yu, Guorong Li, Xiangyuan Lan, Qixiang Ye, Jianbin Jiao, Zhenjun Han
   Abstract: Single object tracking (SOT) relies on precise object bounding box initialization. In this paper, we reconsider the deficiencies in current approaches to initializing single object trackers and propose ClickTrack, a new paradigm for single object tracking that uses clicking interaction for real-time scenarios. Moreover, a click as an input type inherently lacks hierarchical information. To address ambiguity in certain special scenarios, we designed the Guided Click Refiner (GCR), which accepts a point and optional textual information as inputs, transforming the point into the bounding box expected by the operator. The bounding box is then used as the input of single object trackers. Experiments on the LaSOT and GOT-10k benchmarks show that a tracker combined with GCR achieves stable performance in real-time interactive scenarios. Furthermore, we explored the integration of GCR into the Segment Anything Model (SAM), significantly reducing ambiguity issues when SAM receives point inputs.
   Submitted 24 November, 2024; v1 submitted 20 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11327">arXiv:2411.11327</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11327">pdf</a>, <a href="https://arxiv.org/format/2411.11327">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Decision Transformer with Diffusion-Based Trajectory Branch Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhihong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+L">Long Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zeyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+L">Lipeng Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11327v1-abstract-short" style="display: inline;"> Decision Transformer (DT) can learn effective policy from offline datasets by converting the offline reinforcement learning (RL) into a supervised sequence modeling task, where the trajectory elements are generated auto-regressively conditioned on the return-to-go (RTG).However, the sequence modeling learning approach tends to learn policies that converge on the sub-optimal trajectories within the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11327v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11327v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11327v1-abstract-full" style="display: none;"> Decision Transformer (DT) can learn effective policy from offline datasets by converting the offline reinforcement learning (RL) into a supervised sequence modeling task, where the trajectory elements are generated auto-regressively conditioned on the return-to-go (RTG).However, the sequence modeling learning approach tends to learn policies that converge on the sub-optimal trajectories within the dataset, for lack of bridging data to move to better trajectories, even if the condition is set to the highest RTG.To address this issue, we introduce Diffusion-Based Trajectory Branch Generation (BG), which expands the trajectories of the dataset with branches generated by a diffusion model.The trajectory branch is generated based on the segment of the trajectory within the dataset, and leads to trajectories with higher returns.We concatenate the generated branch with the trajectory segment as an expansion of the trajectory.After expanding, DT has more opportunities to learn policies to move to better trajectories, preventing it from converging to the sub-optimal trajectories.Empirically, after processing with BG, DT outperforms state-of-the-art sequence modeling methods on D4RL benchmark, demonstrating the effectiveness of adding branches to the dataset without further modifications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11327v1-abstract-full').style.display = 'none'; document.getElementById('2411.11327v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11417">arXiv:2410.11417</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11417">pdf</a>, <a href="https://arxiv.org/format/2410.11417">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> VidCompress: Memory-Enhanced Temporal Compression for Video Understanding in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaohan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yitian Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Jie%2C+Z">Zequn Jie</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Lin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11417v1-abstract-short" style="display: inline;"> Video-based multimodal large language models (Video-LLMs) possess significant potential for video understanding tasks. However, most Video-LLMs treat videos as a sequential set of individual frames, which results in insufficient temporal-spatial interaction that hinders fine-grained comprehension and difficulty in processing longer videos due to limited visual token capacity. To address these chal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11417v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11417v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11417v1-abstract-full" style="display: none;"> Video-based multimodal large language models (Video-LLMs) possess significant potential for video understanding tasks. However, most Video-LLMs treat videos as a sequential set of individual frames, which results in insufficient temporal-spatial interaction that hinders fine-grained comprehension and difficulty in processing longer videos due to limited visual token capacity. To address these challenges, we propose VidCompress, a novel Video-LLM featuring memory-enhanced temporal compression. VidCompress employs a dual-compressor approach: a memory-enhanced compressor captures both short-term and long-term temporal relationships in videos and compresses the visual tokens using a multiscale transformer with a memory-cache mechanism, while a text-perceived compressor generates condensed visual tokens by utilizing Q-Former and integrating temporal contexts into query embeddings with cross attention. 
Experiments on several VideoQA datasets and comprehensive benchmarks demonstrate that VidCompress efficiently models complex temporal-spatial relations and significantly outperforms existing Video-LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11417v1-abstract-full').style.display = 'none'; document.getElementById('2410.11417v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10370">arXiv:2410.10370</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10370">pdf</a>, <a href="https://arxiv.org/format/2410.10370">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Innovative Thinking, Infinite Humor: Humor Research of Large Language Models through Structured Thought Leaps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yilin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaohan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Gang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10370v1-abstract-short" style="display: inline;"> Humor is a culturally nuanced aspect of human language that presents challenges for understanding and generation, requiring participants to possess good creativity and strong associative thinking. Similar to reasoning tasks like solving math problems, humor generation requires continuous reflection and revision to foster creative thinking, rather than relying on a sudden flash of inspiration like&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10370v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10370v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10370v1-abstract-full" style="display: none;"> Humor is a culturally nuanced aspect of human language that presents challenges for understanding and generation, requiring participants to possess good creativity and strong associative thinking. Similar to reasoning tasks like solving math problems, humor generation requires continuous reflection and revision to foster creative thinking, rather than relying on a sudden flash of inspiration like Creative Leap-of-Thought (CLoT) paradigm. 
Although CLoT can realize the ability of remote association generation, this paradigm fails to generate humor content. Therefore, in this paper, we propose a systematic way of thinking about generating humor and based on it, we built Creative Leap of Structured Thought (CLoST) frame. First, a reward model is necessary achieve the purpose of being able to correct errors, since there is currently no expert model of humor and a usable rule to determine whether a piece of content is humorous. Judgement-oriented instructions are designed to improve the capability of a model, and we also propose an open-domain instruction evolutionary method to fully unleash the potential. Then, through reinforcement learning, the model learns to hone its rationales of the thought chain and refine the strategies it uses. Thus, it learns to recognize and correct its mistakes, and finally generate the most humorous and creative answer. These findings deepen our understanding of the creative capabilities of LLMs and provide ways to enhance LLMs&#39; creative abilities for cross-domain innovative applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10370v1-abstract-full').style.display = 'none'; document.getElementById('2410.10370v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09431">arXiv:2410.09431</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09431">pdf</a>, <a href="https://arxiv.org/format/2410.09431">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> REGNet V2: End-to-End REgion-based Grasp Detection Network for Grippers of Different Sizes in Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+B">Binglei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chengzhong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanbo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiayuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xingyu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09431v1-abstract-short" style="display: inline;"> Grasping has been a crucial but challenging problem in robotics for many years. One of the most important challenges is how to make grasping generalizable and robust to novel objects as well as grippers in unstructured environments. We present \regnet, a robotic grasping system that can adapt to different parallel jaws to grasp diversified objects. 
To support different grippers, \regnet embeds the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09431v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09431v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09431v1-abstract-full" style="display: none;"> Grasping has been a crucial but challenging problem in robotics for many years. One of the most important challenges is how to make grasping generalizable and robust to novel objects as well as grippers in unstructured environments. We present \regnet, a robotic grasping system that can adapt to different parallel jaws to grasp diversified objects. To support different grippers, \regnet embeds the gripper parameters into point clouds, based on which it predicts suitable grasp configurations. It includes three components: Score Network (SN), Grasp Region Network (GRN), and Refine Network (RN). In the first stage, SN is used to filter suitable points for grasping by grasp confidence scores. In the second stage, based on the selected points, GRN generates a set of grasp proposals. Finally, RN refines the grasp proposals for more accurate and robust predictions. We devise an analytic policy to choose the optimal grasp to be executed from the predicted grasp set. To train \regnet, we construct a large-scale grasp dataset containing collision-free grasp configurations using different parallel-jaw grippers. The experimental results demonstrate that \regnet with the analytic policy achieves the highest success rate of $74.98\%$ in real-world clutter scenes with $20$ objects, significantly outperforming several state-of-the-art methods, including GPD, PointNetGPD, and S4G. The code and dataset are available at https://github.com/zhaobinglei/REGNet-V2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09431v1-abstract-full').style.display = 'none'; document.getElementById('2410.09431v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05938">arXiv:2410.05938</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05938">pdf</a>, <a href="https://arxiv.org/format/2410.05938">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EMMA: Empowering Multi-modal Mamba with Structural and Hierarchical Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Y">Yifei Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruiping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Wenjun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Q">Qingfang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yaowei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05938v1-abstract-short" style="display: inline;"> Mamba-based architectures have shown to be a promising new direction for deep learning models owing to their competitive performance and sub-quadratic deployment speed. However, current Mamba multi-modal large language models (MLLM) are insufficient in extracting visual features, leading to imbalanced cross-modal alignment between visual and textural latents, negatively impacting performance on mu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05938v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05938v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05938v1-abstract-full" style="display: none;"> Mamba-based architectures have shown to be a promising new direction for deep learning models owing to their competitive performance and sub-quadratic deployment speed. However, current Mamba multi-modal large language models (MLLM) are insufficient in extracting visual features, leading to imbalanced cross-modal alignment between visual and textural latents, negatively impacting performance on multi-modal tasks. In this work, we propose Empowering Multi-modal Mamba with Structural and Hierarchical Alignment (EMMA), which enables the MLLM to extract fine-grained visual information. Specifically, we propose a pixel-wise alignment module to autoregressively optimize the learning and processing of spatial image-level features along with textual tokens, enabling structural alignment at the image level. In addition, to prevent the degradation of visual information during the cross-model alignment process, we propose a multi-scale feature fusion (MFF) module to combine multi-scale visual features from intermediate layers, enabling hierarchical alignment at the feature level. Extensive experiments are conducted across a variety of multi-modal benchmarks. 
Our model shows lower latency than other Mamba-based MLLMs and is nearly four times faster than transformer-based MLLMs of similar scale during inference. Due to better cross-modal alignment, our model exhibits lower degrees of hallucination and enhanced sensitivity to visual details, which manifests in superior performance across diverse multi-modal benchmarks. Code will be provided. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05938v1-abstract-full').style.display = 'none'; document.getElementById('2410.05938v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05767">arXiv:2410.05767</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05767">pdf</a>, <a href="https://arxiv.org/format/2410.05767">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Grounding is All You Need? Dual Temporal Grounding for Video Dialog </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Y">You Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xinze Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+H">Hao Fei</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zimmermann%2C+R">Roger Zimmermann</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+L">Lizi Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05767v2-abstract-short" style="display: inline;"> In the realm of video dialog response generation, the understanding of video content and the temporal nuances of conversation history are paramount. While a segment of current research leans heavily on large-scale pretrained visual-language models and often overlooks temporal dynamics, another delves deep into spatial-temporal relationships within videos but demands intricate object trajectory pre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05767v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05767v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05767v2-abstract-full" style="display: none;"> In the realm of video dialog response generation, the understanding of video content and the temporal nuances of conversation history are paramount. 
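The multi-scale feature fusion (MFF) module in the EMMA abstract combines visual features drawn from several intermediate layers. One common way to realize such a module, offered purely as an illustrative sketch (the layer choice, projection, and fusion operator here are assumptions rather than the paper's design), is to project each layer's features to a shared width and sum them:

```python
import torch
import torch.nn as nn

class MultiScaleFusion(nn.Module):
    """Project per-layer visual features to a common width and sum them token-wise."""
    def __init__(self, in_dims, out_dim):
        super().__init__()
        self.proj = nn.ModuleList(nn.Linear(d, out_dim) for d in in_dims)

    def forward(self, feats):  # feats[i]: (batch, tokens, in_dims[i])
        projected = [p(f) for p, f in zip(self.proj, feats)]
        return torch.stack(projected, dim=0).sum(dim=0)

# Illustrative usage with three intermediate layers of different widths.
fuse = MultiScaleFusion(in_dims=[256, 512, 1024], out_dim=512)
features = [torch.randn(2, 196, d) for d in (256, 512, 1024)]
fused = fuse(features)  # shape (2, 196, 512)
```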
While a segment of current research leans heavily on large-scale pretrained visual-language models and often overlooks temporal dynamics, another delves deep into spatial-temporal relationships within videos but demands intricate object trajectory pre-extractions and sidelines dialog temporal dynamics. This paper introduces the Dual Temporal Grounding-enhanced Video Dialog model (DTGVD), strategically designed to merge the strengths of both dominant approaches. It emphasizes dual temporal relationships by predicting dialog turn-specific temporal regions, filtering video content accordingly, and grounding responses in both video and dialog contexts. One standout feature of DTGVD is its heightened attention to chronological interplay. By recognizing and acting upon the dependencies between different dialog turns, it captures more nuanced conversational dynamics. To further bolster the alignment between video and dialog temporal dynamics, we&#39;ve implemented a list-wise contrastive learning strategy. Within this framework, accurately grounded turn-clip pairings are designated as positive samples, while less precise pairings are categorized as negative. This refined classification is then funneled into our holistic end-to-end response generation mechanism. Evaluations using AVSD@DSTC-7 and AVSD@DSTC-8 datasets underscore the superiority of our methodology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05767v2-abstract-full').style.display = 'none'; document.getElementById('2410.05767v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
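<p class="is-size-7">A minimal PyTorch sketch of the list-wise contrastive objective summarized in the abstract above, assuming precomputed dialog-turn and video-clip embeddings; the names <code>turn_emb</code>, <code>clip_emb</code> and <code>temperature</code> are illustrative rather than taken from the paper's code.</p>
<pre><code class="language-python">
# Illustrative sketch: list-wise contrastive loss for turn-clip grounding.
import torch
import torch.nn.functional as F

def listwise_contrastive_loss(turn_emb, clip_emb, positive_idx, temperature=0.07):
    """turn_emb: (N, D) dialog-turn embeddings; clip_emb: (M, D) candidate clip embeddings;
    positive_idx: (N,) index of the correctly grounded clip for each turn."""
    turn_emb = F.normalize(turn_emb, dim=-1)
    clip_emb = F.normalize(clip_emb, dim=-1)
    logits = turn_emb @ clip_emb.t() / temperature   # (N, M): one similarity list per turn
    # Softmax cross-entropy over the whole candidate list: the accurately grounded
    # turn-clip pairing is the positive, every other clip in the list is a negative.
    return F.cross_entropy(logits, positive_idx)

# Toy usage with random features.
turns = torch.randn(4, 256)
clips = torch.randn(10, 256)
positives = torch.tensor([2, 7, 0, 5])
loss = listwise_contrastive_loss(turns, clips, positives)
</code></pre>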
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02664">arXiv:2410.02664</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02664">pdf</a>, <a href="https://arxiv.org/format/2410.02664">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Grounded Answers for Multi-agent Decision-making Problem through Generative World Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zeyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinrui Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+S">Shiguang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+L">Long Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+L">Lipeng Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02664v1-abstract-short" style="display: inline;"> Recent progress in generative models has stimulated significant innovations in many fields, such as image generation and chatbots. Despite their success, these models often produce sketchy and misleading solutions for complex multi-agent decision-making problems because they miss the trial-and-error experience and reasoning as humans. To address this limitation, we explore a paradigm that integrat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02664v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02664v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02664v1-abstract-full" style="display: none;"> Recent progress in generative models has stimulated significant innovations in many fields, such as image generation and chatbots. Despite their success, these models often produce sketchy and misleading solutions for complex multi-agent decision-making problems because they miss the trial-and-error experience and reasoning as humans. To address this limitation, we explore a paradigm that integrates a language-guided simulator into the multi-agent reinforcement learning pipeline to enhance the generated answer. The simulator is a world model that separately learns dynamics and reward, where the dynamics model comprises an image tokenizer as well as a causal transformer to generate interaction transitions autoregressively, and the reward model is a bidirectional transformer learned by maximizing the likelihood of trajectories in the expert demonstrations under language guidance. Given an image of the current state and the task description, we use the world model to train the joint policy and produce the image sequence as the answer by running the converged policy on the dynamics model. 
The empirical results demonstrate that this framework can improve the answers for multi-agent decision-making problems by showing superior performance on the training and unseen tasks of the StarCraft Multi-Agent Challenge benchmark. In particular, it can generate consistent interaction sequences and explainable reward functions at interaction states, opening the path for training generative models of the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02664v1-abstract-full').style.display = 'none'; document.getElementById('2410.02664v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The Thirty-eighth Annual Conference on Neural Information Processing Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08444">arXiv:2409.08444</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.08444">pdf</a>, <a href="https://arxiv.org/format/2409.08444">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Unified Facial Action Unit Recognition Framework by Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+G">Guohong Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xing Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hanyu Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+J">Jiayi Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+J">Jian Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08444v1-abstract-short" style="display: inline;"> Facial Action Units (AUs) are of great significance in the realm of affective computing. In this paper, we propose AU-LLaVA, the first unified AU recognition framework based on the Large Language Model (LLM). AU-LLaVA consists of a visual encoder, a linear projector layer, and a pre-trained LLM. We meticulously craft the text descriptions and fine-tune the model on various AU datasets, allowing it&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08444v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08444v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08444v1-abstract-full" style="display: none;"> Facial Action Units (AUs) are of great significance in the realm of affective computing. In this paper, we propose AU-LLaVA, the first unified AU recognition framework based on the Large Language Model (LLM). AU-LLaVA consists of a visual encoder, a linear projector layer, and a pre-trained LLM. 
We meticulously craft the text descriptions and fine-tune the model on various AU datasets, allowing it to generate different formats of AU recognition results for the same input image. On the BP4D and DISFA datasets, AU-LLaVA delivers the most accurate recognition results for nearly half of the AUs. Our model achieves improvements of F1-score up to 11.4% in specific AU recognition compared to previous benchmark results. On the FEAFA dataset, our method achieves significant improvements over all 24 AUs compared to previous benchmark results. AU-LLaVA demonstrates exceptional performance and versatility in AU recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08444v1-abstract-full').style.display = 'none'; document.getElementById('2409.08444v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07129">arXiv:2409.07129</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.07129">pdf</a>, <a href="https://arxiv.org/format/2409.07129">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MVLLaVA: An Intelligent Agent for Unified and Flexible Novel View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hanyu Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+J">Jian Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xing Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+G">Guohong Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Ke Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07129v1-abstract-short" style="display: inline;"> This paper introduces MVLLaVA, an intelligent agent designed for novel view synthesis tasks. MVLLaVA integrates multiple multi-view diffusion models with a large multimodal model, LLaVA, enabling it to handle a wide range of tasks efficiently. MVLLaVA represents a versatile and unified platform that adapts to diverse input types, including a single image, a descriptive caption, or a specific chang&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07129v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07129v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07129v1-abstract-full" style="display: none;"> This paper introduces MVLLaVA, an intelligent agent designed for novel view synthesis tasks. MVLLaVA integrates multiple multi-view diffusion models with a large multimodal model, LLaVA, enabling it to handle a wide range of tasks efficiently. 
MVLLaVA represents a versatile and unified platform that adapts to diverse input types, including a single image, a descriptive caption, or a specific change in viewing azimuth, guided by language instructions for viewpoint generation. We carefully craft task-specific instruction templates, which are subsequently used to fine-tune LLaVA. As a result, MVLLaVA acquires the capability to generate novel view images based on user instructions, demonstrating its flexibility across diverse tasks. Experiments are conducted to validate the effectiveness of MVLLaVA, demonstrating its robust performance and versatility in tackling diverse novel view synthesis challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07129v1-abstract-full').style.display = 'none'; document.getElementById('2409.07129v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://jamesjg.github.io/MVLLaVA_homepage/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05493">arXiv:2409.05493</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05493">pdf</a>, <a href="https://arxiv.org/format/2409.05493">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> DexDiff: Towards Extrinsic Dexterity Manipulation of Ungraspable Objects in Unrestricted Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chengzhong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Houxue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanbo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zeyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Chao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+N">Nanning Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05493v1-abstract-short" style="display: inline;"> Grasping large and flat objects (e.g. a book or a pan) is often regarded as an ungraspable task, which poses significant challenges due to the unreachable grasping poses. Previous works leverage Extrinsic Dexterity like walls or table edges to grasp such objects. However, they are limited to task-specific policies and lack task planning to find pre-grasp conditions. 
This makes it difficult to adap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05493v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05493v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05493v1-abstract-full" style="display: none;"> Grasping large and flat objects (e.g. a book or a pan) is often regarded as an ungraspable task, which poses significant challenges due to the unreachable grasping poses. Previous works leverage Extrinsic Dexterity like walls or table edges to grasp such objects. However, they are limited to task-specific policies and lack task planning to find pre-grasp conditions. This makes it difficult to adapt to various environments and extrinsic dexterity constraints. Therefore, we present DexDiff, a robust robotic manipulation method for long-horizon planning with extrinsic dexterity. Specifically, we utilize a vision-language model (VLM) to perceive the environmental state and generate high-level task plans, followed by a goal-conditioned action diffusion (GCAD) model to predict the sequence of low-level actions. This model learns the low-level policy from offline data with the cumulative reward guided by high-level planning as the goal condition, which allows for improved prediction of robot actions. Experimental results demonstrate that our method not only effectively performs ungraspable tasks but also generalizes to previously unseen objects. It outperforms baselines by a 47% higher success rate in simulation and facilitates efficient deployment and manipulation in real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05493v1-abstract-full').style.display = 'none'; document.getElementById('2409.05493v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
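<p class="is-size-7">The paper pairs a VLM planner with a goal-conditioned action diffusion (GCAD) model; as a much simpler stand-in that only illustrates the goal-conditioning idea, the sketch below trains a plain regression policy on offline observation-action pairs, using the cumulative reward from high-level planning as the goal signal. Class and argument names are hypothetical.</p>
<pre><code class="language-python">
# Illustrative sketch: goal-conditioned low-level policy trained on offline data,
# with the planner-provided cumulative reward as the goal (a simplified stand-in
# for the goal-conditioned action diffusion model described above).
import torch
import torch.nn as nn

class GoalConditionedPolicy(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + 1, hidden), nn.ReLU(),   # +1 for the scalar return goal
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, act_dim),
        )

    def forward(self, obs, goal_return):
        # goal_return: (B, 1) cumulative reward target supplied by high-level planning
        return self.net(torch.cat([obs, goal_return], dim=-1))

# One offline training step on (observation, action, cumulative-reward) tuples.
policy = GoalConditionedPolicy(obs_dim=32, act_dim=7)
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)
obs, act, ret = torch.randn(64, 32), torch.randn(64, 7), torch.rand(64, 1)
loss = nn.functional.mse_loss(policy(obs, ret), act)
optimizer.zero_grad()
loss.backward()
optimizer.step()
</code></pre>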
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02828">arXiv:2409.02828</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02828">pdf</a>, <a href="https://arxiv.org/format/2409.02828">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> ExpLLM: Towards Chain of Thought for Facial Expression Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xing Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+J">Jian Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Ji Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Ke Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chua%2C+T">Tat-Seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02828v1-abstract-short" style="display: inline;"> Facial expression recognition (FER) is a critical task in multimedia with significant implications across various domains. However, analyzing the causes of facial expressions is essential for accurately recognizing them. Current approaches, such as those based on facial action units (AUs), typically provide AU names and intensities but lack insight into the interactions and relationships between A&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02828v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02828v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02828v1-abstract-full" style="display: none;"> Facial expression recognition (FER) is a critical task in multimedia with significant implications across various domains. However, analyzing the causes of facial expressions is essential for accurately recognizing them. Current approaches, such as those based on facial action units (AUs), typically provide AU names and intensities but lack insight into the interactions and relationships between AUs and the overall expression. In this paper, we propose a novel method called ExpLLM, which leverages large language models to generate an accurate chain of thought (CoT) for facial expression recognition. Specifically, we have designed the CoT mechanism from three key perspectives: key observations, overall emotional interpretation, and conclusion. The key observations describe the AU&#39;s name, intensity, and associated emotions. The overall emotional interpretation provides an analysis based on multiple AUs and their interactions, identifying the dominant emotions and their relationships. Finally, the conclusion presents the final expression label derived from the preceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed to construct this expression CoT and generate instruction-description data for training our ExpLLM. 
Extensive experiments on the RAF-DB and AffectNet datasets demonstrate that ExpLLM outperforms current state-of-the-art FER methods. ExpLLM also surpasses the latest GPT-4o in expression CoT generation, particularly in recognizing micro-expressions where GPT-4o frequently fails. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02828v1-abstract-full').style.display = 'none'; document.getElementById('2409.02828v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://starhiking.github.io/ExpLLM_Page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11135">arXiv:2408.11135</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.11135">pdf</a>, <a href="https://arxiv.org/format/2408.11135">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MS$^3$D: A RG Flow-Based Regularization for GAN Training with Limited Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xin Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuxin Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+J">Jiancheng Lv</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11135v1-abstract-short" style="display: inline;"> Generative adversarial networks (GANs) have made impressive advances in image generation, but they often require large-scale training data to avoid degradation caused by discriminator overfitting. To tackle this issue, we investigate the challenge of training GANs with limited data, and propose a novel regularization method based on the idea of renormalization group (RG) in physics.We observe that&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11135v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11135v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11135v1-abstract-full" style="display: none;"> Generative adversarial networks (GANs) have made impressive advances in image generation, but they often require large-scale training data to avoid degradation caused by discriminator overfitting. To tackle this issue, we investigate the challenge of training GANs with limited data, and propose a novel regularization method based on the idea of renormalization group (RG) in physics.We observe that in the limited data setting, the gradient pattern that the generator obtains from the discriminator becomes more aggregated over time. 
In the RG context, this aggregated pattern exhibits a high discrepancy from its coarse-grained versions, which implies a high-capacity and sensitive system, prone to overfitting and collapse. To address this problem, we introduce a <strong>m</strong>ulti-<strong>s</strong>cale <strong>s</strong>tructural <strong>s</strong>elf-<strong>d</strong>issimilarity (MS$^3$D) regularization, which constrains the gradient field to have a consistent pattern across different scales, thereby fostering a more redundant and robust system. We show that our method can effectively enhance the performance and stability of GANs under limited data scenarios, and even allow them to generate high-quality images with very little data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11135v1-abstract-full').style.display = 'none'; document.getElementById('2408.11135v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10548">arXiv:2408.10548</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10548">pdf</a>, <a href="https://arxiv.org/format/2408.10548">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Language Modeling on Tabular Data: A Survey of Foundations, Techniques and Evolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+Y">Yucheng Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jingying Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yizhi Dong</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+K">Kai He</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+M">Mengling Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10548v1-abstract-short" style="display: inline;"> Tabular data, a prevalent data type across various domains, presents unique challenges due to its heterogeneous nature and complex structural relationships. Achieving high predictive performance and robustness in tabular data analysis holds significant promise for numerous applications. Influenced by recent advancements in natural language processing, particularly transformer architectures, new me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10548v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10548v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10548v1-abstract-full" style="display: none;"> Tabular data, a prevalent data type across various domains, presents unique challenges due to its heterogeneous nature and complex structural relationships. Achieving high predictive performance and robustness in tabular data analysis holds significant promise for numerous applications.
Influenced by recent advancements in natural language processing, particularly transformer architectures, new methods for tabular data modeling have emerged. Early techniques concentrated on pre-training transformers from scratch, often encountering scalability issues. Subsequently, methods leveraging pre-trained language models like BERT have been developed, which require less data and yield enhanced performance. The recent advent of large language models, such as GPT and LLaMA, has further revolutionized the field, facilitating more advanced and diverse applications with minimal fine-tuning. Despite the growing interest, a comprehensive survey of language modeling techniques for tabular data remains absent. This paper fills this gap by providing a systematic review of the development of language modeling for tabular data, encompassing: (1) a categorization of different tabular data structures and data types; (2) a review of key datasets used in model training and tasks used for evaluation; (3) a summary of modeling techniques including widely-adopted data processing methods, popular architectures, and training objectives; (4) the evolution from adapting traditional Pre-training/Pre-trained language models to the utilization of large language models; (5) an identification of persistent challenges and potential future research directions in language modeling for tabular data analysis. GitHub page associated with this survey is available at: https://github.com/lanxiang1017/Language-Modeling-on-Tabular-Data-Survey.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10548v1-abstract-full').style.display = 'none'; document.getElementById('2408.10548v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04386">arXiv:2408.04386</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04386">pdf</a>, <a href="https://arxiv.org/format/2408.04386">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Reflections on Teaching Data Visualization at the Journalism School </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xingyu Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04386v1-abstract-short" style="display: inline;"> The integration of data visualization in journalism has catalyzed the growth of data storytelling in recent years. Today, it is increasingly common for journalism schools to incorporate data visualization into their curricula. 
However, the approach to teaching data visualization in journalism schools can diverge significantly from that in computer science or design schools, influenced by the varie&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04386v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04386v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04386v1-abstract-full" style="display: none;"> The integration of data visualization in journalism has catalyzed the growth of data storytelling in recent years. Today, it is increasingly common for journalism schools to incorporate data visualization into their curricula. However, the approach to teaching data visualization in journalism schools can diverge significantly from that in computer science or design schools, influenced by the varied backgrounds of students and the distinct value systems inherent to these disciplines. This paper reviews my experience and reflections on teaching data visualization in a journalism school. First, I discuss the prominent characteristics of journalism education that pose challenges for course design and teaching. Then, I share firsthand teaching experiences related to each characteristic and recommend approaches for effective teaching. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04386v1-abstract-full').style.display = 'none'; document.getElementById('2408.04386v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11699">arXiv:2407.11699</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11699">pdf</a>, <a href="https://arxiv.org/format/2407.11699">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Relation DETR: Exploring Explicit Position Relation Prior for Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hou%2C+X">Xiuquan Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Meiqin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Senlin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+P">Ping Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Badong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11699v1-abstract-short" style="display: inline;"> This paper presents a general scheme for enhancing the convergence and performance of DETR (DEtection TRansformer). We investigate the slow convergence problem in transformers from a new perspective, suggesting that it arises from the self-attention that introduces no structural bias over inputs. 
To address this issue, we explore incorporating position relation prior as attention bias to augment o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11699v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11699v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11699v1-abstract-full" style="display: none;"> This paper presents a general scheme for enhancing the convergence and performance of DETR (DEtection TRansformer). We investigate the slow convergence problem in transformers from a new perspective, suggesting that it arises from the self-attention that introduces no structural bias over inputs. To address this issue, we explore incorporating position relation prior as attention bias to augment object detection, following the verification of its statistical significance using a proposed quantitative macroscopic correlation (MC) metric. Our approach, termed Relation-DETR, introduces an encoder to construct position relation embeddings for progressive attention refinement, which further extends the traditional streaming pipeline of DETR into a contrastive relation pipeline to address the conflicts between non-duplicate predictions and positive supervision. Extensive experiments on both generic and task-specific datasets demonstrate the effectiveness of our approach. Under the same configurations, Relation-DETR achieves a significant improvement (+2.0% AP compared to DINO), state-of-the-art performance (51.7% AP for 1x and 52.1% AP for 2x settings), and a remarkably faster convergence speed (over 40% AP with only 2 training epochs) than existing DETR detectors on COCO val2017. Moreover, the proposed relation encoder serves as a universal plug-in-and-play component, bringing clear improvements for theoretically any DETR-like methods. Furthermore, we introduce a class-agnostic detection dataset, SA-Det-100k. The experimental results on the dataset illustrate that the proposed explicit position relation achieves a clear improvement of 1.3% AP, highlighting its potential towards universal object detection. The code and dataset are available at https://github.com/xiuqhou/Relation-DETR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11699v1-abstract-full').style.display = 'none'; document.getElementById('2407.11699v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
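<p class="is-size-7">A minimal PyTorch sketch of the general idea of turning pairwise box geometry into a per-head attention bias, in the spirit of the position relation prior described above; the module name, feature construction, and sizes are assumptions for illustration, not the Relation-DETR implementation (see the repository linked in the abstract for the authors' code).</p>
<pre><code class="language-python">
# Illustrative sketch: embed pairwise box geometry and add it to attention logits.
import torch
import torch.nn as nn

class RelationAttentionBias(nn.Module):
    def __init__(self, num_heads=8, hidden=64):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(4, hidden), nn.ReLU(), nn.Linear(hidden, num_heads))

    def forward(self, boxes):
        # boxes: (N, 4) as (cx, cy, w, h); build log-scaled pairwise relation features.
        cx, cy, w, h = boxes.unbind(-1)
        dx = torch.log(torch.abs(cx[:, None] - cx[None, :]) / w[:, None] + 1e-6)
        dy = torch.log(torch.abs(cy[:, None] - cy[None, :]) / h[:, None] + 1e-6)
        dw = torch.log(w[None, :] / w[:, None])
        dh = torch.log(h[None, :] / h[:, None])
        rel = torch.stack([dx, dy, dw, dh], dim=-1)    # (N, N, 4)
        return self.mlp(rel).permute(2, 0, 1)          # (num_heads, N, N)

boxes = torch.rand(5, 4) + 0.1                         # keep widths and heights positive
bias = RelationAttentionBias()(boxes)
# Inside a multi-head self-attention layer over object queries:
# attention_logits = attention_logits + bias
</code></pre>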
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11497">arXiv:2407.11497</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11497">pdf</a>, <a href="https://arxiv.org/format/2407.11497">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> &#34;I Came Across a Junk&#34;: Understanding Design Flaws of Data Visualization from the Public&#39;s Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xingyu Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11497v3-abstract-short" style="display: inline;"> The visualization community has a rich history of reflecting upon flaws of visualization design, and research in this direction has remained lively until now. However, three main gaps still exist. First, most existing work characterizes design flaws from the perspective of researchers rather than the perspective of general users. Second, little work has been done to infer why these design flaws oc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11497v3-abstract-full').style.display = 'inline'; document.getElementById('2407.11497v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11497v3-abstract-full" style="display: none;"> The visualization community has a rich history of reflecting upon flaws of visualization design, and research in this direction has remained lively until now. However, three main gaps still exist. First, most existing work characterizes design flaws from the perspective of researchers rather than the perspective of general users. Second, little work has been done to infer why these design flaws occur. Third, due to problems such as unclear terminology and ambiguous research scope, a better framework that systematically outlines various design flaws and helps distinguish different types of flaws is desired. To address the above gaps, this work investigated visualization design flaws through the lens of the public, constructed a framework to summarize and categorize the identified flaws, and explored why these flaws occur. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11497v3-abstract-full').style.display = 'none'; document.getElementById('2407.11497v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07844">arXiv:2407.07844</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07844">pdf</a>, <a href="https://arxiv.org/format/2407.07844">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OV-DINO: Unified Open-Vocabulary Detection with Language-Aware Selective Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+P">Pengzhen Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Jie%2C+Z">Zequn Jie</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+X">Xiao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chengjian Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Y">Yinlong Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Lin Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07844v2-abstract-short" style="display: inline;"> Open-vocabulary detection is a challenging task due to the requirement of detecting objects based on class names, including those not encountered during training. Existing methods have shown strong zero-shot detection capabilities through pre-training and pseudo-labeling on diverse large-scale datasets. However, these approaches encounter two main challenges: (i) how to effectively eliminate data&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07844v2-abstract-full').style.display = 'inline'; document.getElementById('2407.07844v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07844v2-abstract-full" style="display: none;"> Open-vocabulary detection is a challenging task due to the requirement of detecting objects based on class names, including those not encountered during training. Existing methods have shown strong zero-shot detection capabilities through pre-training and pseudo-labeling on diverse large-scale datasets. However, these approaches encounter two main challenges: (i) how to effectively eliminate data noise from pseudo-labeling, and (ii) how to efficiently leverage the language-aware capability for region-level cross-modality fusion and alignment. To address these challenges, we propose a novel unified open-vocabulary detection method called OV-DINO, which is pre-trained on diverse large-scale datasets with language-aware selective fusion in a unified framework. Specifically, we introduce a Unified Data Integration (UniDI) pipeline to enable end-to-end training and eliminate noise from pseudo-label generation by unifying different data sources into detection-centric data format. 
In addition, we propose a Language-Aware Selective Fusion (LASF) module to enhance the cross-modality alignment through a language-aware query selection and fusion process. We evaluate the performance of the proposed OV-DINO on popular open-vocabulary detection benchmarks, achieving state-of-the-art results with an AP of 50.6% on the COCO benchmark and 40.1% on the LVIS benchmark in a zero-shot manner, demonstrating its strong generalization ability. Furthermore, the fine-tuned OV-DINO on COCO achieves 58.4% AP, outperforming many existing methods with the same backbone. The code for OV-DINO is available at https://github.com/wanghao9610/OV-DINO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07844v2-abstract-full').style.display = 'none'; document.getElementById('2407.07844v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01622">arXiv:2404.01622</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.01622">pdf</a>, <a href="https://arxiv.org/ps/2404.01622">ps</a>, <a href="https://arxiv.org/format/2404.01622">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Gen4DS: Workshop on Data Storytelling in an Era of Generative AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xingyu Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Leni Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zezhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+D">Danqing Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Carpendale%2C+S">Sheelagh Carpendale</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01622v2-abstract-short" style="display: inline;"> Storytelling is an ancient and precious human ability that has been rejuvenated in the digital age. Over the last decade, there has been a notable surge in the recognition and application of data storytelling, both in academia and industry. Recently, the rapid development of generative AI has brought new opportunities and challenges to this field, sparking numerous new questions. 
These questions m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01622v2-abstract-full').style.display = 'inline'; document.getElementById('2404.01622v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01622v2-abstract-full" style="display: none;"> Storytelling is an ancient and precious human ability that has been rejuvenated in the digital age. Over the last decade, there has been a notable surge in the recognition and application of data storytelling, both in academia and industry. Recently, the rapid development of generative AI has brought new opportunities and challenges to this field, sparking numerous new questions. These questions may not necessarily be quickly transformed into papers, but we believe it is necessary to promptly discuss them to help the community better clarify important issues and research agendas for the future. We thus invite you to join our workshop (Gen4DS) to discuss questions such as: How can generative AI facilitate the creation of data stories? How might generative AI alter the workflow of data storytellers? What are the pitfalls and risks of incorporating AI in storytelling? We have designed both paper presentations and interactive activities (including hands-on creation, group discussion pods, and debates on controversial issues) for the workshop. We hope that participants will learn about the latest advances and pioneering work in data storytelling, engage in critical conversations with each other, and have an enjoyable, unforgettable, and meaningful experience at the event. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01622v2-abstract-full').style.display = 'none'; document.getElementById('2404.01622v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10750">arXiv:2403.10750</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.10750">pdf</a>, <a href="https://arxiv.org/format/2403.10750">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Depression Detection on Social Media with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaochong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yiming Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+L">Li Sheng</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10750v1-abstract-short" style="display: inline;"> Depression harms. However, due to a lack of mental health awareness and fear of stigma, many patients do not actively seek diagnosis and treatment, leading to detrimental outcomes. Depression detection aims to determine whether an individual suffers from depression by analyzing their history of posts on social media, which can significantly aid in early detection and intervention. It mainly faces&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10750v1-abstract-full').style.display = 'inline'; document.getElementById('2403.10750v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10750v1-abstract-full" style="display: none;"> Depression harms. However, due to a lack of mental health awareness and fear of stigma, many patients do not actively seek diagnosis and treatment, leading to detrimental outcomes. Depression detection aims to determine whether an individual suffers from depression by analyzing their history of posts on social media, which can significantly aid in early detection and intervention. It mainly faces two key challenges: 1) it requires professional medical knowledge, and 2) it necessitates both high accuracy and explainability. To address it, we propose a novel depression detection system called DORIS, combining medical knowledge and the recent advances in large language models (LLMs). Specifically, to tackle the first challenge, we proposed an LLM-based solution to first annotate whether high-risk texts meet medical diagnostic criteria. Further, we retrieve texts with high emotional intensity and summarize critical information from the historical mood records of users, so-called mood courses. To tackle the second challenge, we combine LLM and traditional classifiers to integrate medical knowledge-guided features, for which the model can also explain its prediction results, achieving both high accuracy and explainability. 
Extensive experimental results on benchmarking datasets show that, compared to the current best baseline, our approach improves by 0.036 in AUPRC, which can be considered significant, demonstrating the effectiveness of our approach and its high value as an NLP application. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10750v1-abstract-full').style.display = 'none'; document.getElementById('2403.10750v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.19231">arXiv:2402.19231</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.19231">pdf</a>, <a href="https://arxiv.org/format/2402.19231">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> CricaVPR: Cross-image Correlation-aware Representation Learning for Visual Place Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+F">Feng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lijun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+C">Chun Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.19231v2-abstract-short" style="display: inline;"> Over the past decade, most methods in visual place recognition (VPR) have used neural networks to produce feature representations. These networks typically produce a global representation of a place image using only this image itself and neglect the cross-image variations (e.g. viewpoint and illumination), which limits their robustness in challenging scenes. In this paper, we propose a robust glob&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.19231v2-abstract-full').style.display = 'inline'; document.getElementById('2402.19231v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.19231v2-abstract-full" style="display: none;"> Over the past decade, most methods in visual place recognition (VPR) have used neural networks to produce feature representations. These networks typically produce a global representation of a place image using only this image itself and neglect the cross-image variations (e.g. viewpoint and illumination), which limits their robustness in challenging scenes. In this paper, we propose a robust global representation method with cross-image correlation awareness for VPR, named CricaVPR. Our method uses the attention mechanism to correlate multiple images within a batch. 
These images can be taken in the same place with different conditions or viewpoints, or even captured from different places. Therefore, our method can utilize the cross-image variations as a cue to guide the representation learning, which ensures more robust features are produced. To further facilitate the robustness, we propose a multi-scale convolution-enhanced adaptation method to adapt pre-trained visual foundation models to the VPR task, which introduces the multi-scale local information to further enhance the cross-image correlation-aware representation. Experimental results show that our method outperforms state-of-the-art methods by a large margin with significantly less training time. The code is released at https://github.com/Lu-Feng/CricaVPR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.19231v2-abstract-full').style.display = 'none'; document.getElementById('2402.19231v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.17978">arXiv:2402.17978</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.17978">pdf</a>, <a href="https://arxiv.org/format/2402.17978">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Imagine, Initialize, and Explore: An Effective Exploration Method in Multi-Agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zeyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+L">Lipeng Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinrui Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhuoran Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.17978v2-abstract-short" style="display: inline;"> Effective exploration is crucial to discovering optimal strategies for multi-agent reinforcement learning (MARL) in complex coordination tasks. Existing methods mainly utilize intrinsic rewards to enable committed exploration or use role-based learning for decomposing joint action spaces instead of directly conducting a collective search in the entire action-observation space. 
However, they often&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17978v2-abstract-full').style.display = 'inline'; document.getElementById('2402.17978v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.17978v2-abstract-full" style="display: none;"> Effective exploration is crucial to discovering optimal strategies for multi-agent reinforcement learning (MARL) in complex coordination tasks. Existing methods mainly utilize intrinsic rewards to enable committed exploration or use role-based learning for decomposing joint action spaces instead of directly conducting a collective search in the entire action-observation space. However, they often face challenges obtaining specific joint action sequences to reach successful states in long-horizon tasks. To address this limitation, we propose Imagine, Initialize, and Explore (IIE), a novel method that offers a promising solution for efficient multi-agent exploration in complex scenarios. IIE employs a transformer model to imagine how the agents reach a critical state that can influence each other&#39;s transition functions. Then, we initialize the environment at this state using a simulator before the exploration phase. We formulate the imagination as a sequence modeling problem, where the states, observations, prompts, actions, and rewards are predicted autoregressively. The prompt consists of timestep-to-go, return-to-go, influence value, and one-shot demonstration, specifying the desired state and trajectory as well as guiding the action generation. By initializing agents at the critical states, IIE significantly increases the likelihood of discovering potentially important under-explored regions. Despite its simplicity, empirical results demonstrate that our method outperforms multi-agent exploration baselines on the StarCraft Multi-Agent Challenge (SMAC) and SMACv2 environments. Particularly, IIE shows improved performance in the sparse-reward SMAC tasks and produces more effective curricula over the initialized states than other generative methods, such as CVAE-GAN and diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17978v2-abstract-full').style.display = 'none'; document.getElementById('2402.17978v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The 38th Annual AAAI Conference on Artificial Intelligence</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.16086">arXiv:2402.16086</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.16086">pdf</a>, <a href="https://arxiv.org/format/2402.16086">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1609/aaai.v38i9.28901">10.1609/aaai.v38i9.28901 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Deep Homography Estimation for Visual Place Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+F">Feng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+S">Shuting Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lijun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Bingxi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+C">Chun Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.16086v2-abstract-short" style="display: inline;"> Visual place recognition (VPR) is a fundamental task for many applications such as robot localization and augmented reality. Recently, the hierarchical VPR methods have received considerable attention due to the trade-off between accuracy and efficiency. They usually first use global features to retrieve the candidate images, then verify the spatial consistency of matched local features for re-ran&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16086v2-abstract-full').style.display = 'inline'; document.getElementById('2402.16086v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.16086v2-abstract-full" style="display: none;"> Visual place recognition (VPR) is a fundamental task for many applications such as robot localization and augmented reality. Recently, the hierarchical VPR methods have received considerable attention due to the trade-off between accuracy and efficiency. They usually first use global features to retrieve the candidate images, then verify the spatial consistency of matched local features for re-ranking. However, the latter typically relies on the RANSAC algorithm for fitting homography, which is time-consuming and non-differentiable. This makes existing methods compromise to train the network only in global feature extraction. 
Here, we propose a transformer-based deep homography estimation (DHE) network that takes the dense feature map extracted by a backbone network as input and fits homography for fast and learnable geometric verification. Moreover, we design a re-projection error of inliers loss to train the DHE network without additional homography labels, which can also be jointly trained with the backbone network to help it extract the features that are more suitable for local matching. Extensive experiments on benchmark datasets show that our method can outperform several state-of-the-art methods. And it is more than one order of magnitude faster than the mainstream hierarchical VPR methods using RANSAC. The code is released at https://github.com/Lu-Feng/DHE-VPR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16086v2-abstract-full').style.display = 'none'; document.getElementById('2402.16086v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> AAAI 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.14505">arXiv:2402.14505</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.14505">pdf</a>, <a href="https://arxiv.org/format/2402.14505">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Seamless Adaptation of Pre-trained Models for Visual Place Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+F">Feng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lijun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+S">Shuting Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+C">Chun Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.14505v3-abstract-short" style="display: inline;"> Recent studies show that vision models pre-trained in generic visual learning tasks with large-scale data can provide useful feature representations for a wide range of visual perception problems. However, few attempts have been made to exploit pre-trained foundation models in visual place recognition (VPR). 
Due to the inherent difference in training objectives and data between the tasks of model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.14505v3-abstract-full').style.display = 'inline'; document.getElementById('2402.14505v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.14505v3-abstract-full" style="display: none;"> Recent studies show that vision models pre-trained in generic visual learning tasks with large-scale data can provide useful feature representations for a wide range of visual perception problems. However, few attempts have been made to exploit pre-trained foundation models in visual place recognition (VPR). Due to the inherent difference in training objectives and data between the tasks of model pre-training and VPR, how to bridge the gap and fully unleash the capability of pre-trained models for VPR is still a key issue to address. To this end, we propose a novel method to realize seamless adaptation of pre-trained models for VPR. Specifically, to obtain both global and local features that focus on salient landmarks for discriminating places, we design a hybrid adaptation method to achieve both global and local adaptation efficiently, in which only lightweight adapters are tuned without adjusting the pre-trained model. Besides, to guide effective adaptation, we propose a mutual nearest neighbor local feature loss, which ensures proper dense local features are produced for local matching and avoids time-consuming spatial verification in re-ranking. Experimental results show that our method outperforms the state-of-the-art methods with less training data and training time, and uses about only 3% retrieval runtime of the two-stage VPR methods with RANSAC-based spatial verification. It ranks 1st on the MSLS challenge leaderboard (at the time of submission). The code is released at https://github.com/Lu-Feng/SelaVPR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.14505v3-abstract-full').style.display = 'none'; document.getElementById('2402.14505v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11816">arXiv:2402.11816</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.11816">pdf</a>, <a href="https://arxiv.org/format/2402.11816">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning the Unlearned: Mitigating Feature Suppression in Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jihai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+X">Xiaoye Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yu Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+M">Mengling Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Hooi%2C+B">Bryan Hooi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11816v3-abstract-short" style="display: inline;"> Self-Supervised Contrastive Learning has proven effective in deriving high-quality representations from unlabeled data. However, a major challenge that hinders both unimodal and multimodal contrastive learning is feature suppression, a phenomenon where the trained model captures only a limited portion of the information from the input data while overlooking other potentially valuable content. This&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11816v3-abstract-full').style.display = 'inline'; document.getElementById('2402.11816v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11816v3-abstract-full" style="display: none;"> Self-Supervised Contrastive Learning has proven effective in deriving high-quality representations from unlabeled data. However, a major challenge that hinders both unimodal and multimodal contrastive learning is feature suppression, a phenomenon where the trained model captures only a limited portion of the information from the input data while overlooking other potentially valuable content. This issue often leads to indistinguishable representations for visually similar but semantically different inputs, adversely affecting downstream task performance, particularly those requiring rigorous semantic comprehension. To address this challenge, we propose a novel model-agnostic Multistage Contrastive Learning (MCL) framework. Unlike standard contrastive learning which inherently captures one single biased feature distribution, MCL progressively learns previously unlearned features through feature-aware negative sampling at each stage, where the negative samples of an anchor are exclusively selected from the cluster it was assigned to in preceding stages. 
Meanwhile, MCL preserves the previously well-learned features by cross-stage representation integration, integrating features across all stages to form final representations. Our comprehensive evaluation demonstrates MCL&#39;s effectiveness and superiority across both unimodal and multimodal contrastive learning, spanning a range of model architectures from ResNet to Vision Transformers (ViT). Remarkably, in tasks where the original CLIP model has shown limitations, MCL dramatically enhances performance, with improvements up to threefold on specific attributes in the recently proposed MMVP benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11816v3-abstract-full').style.display = 'none'; document.getElementById('2402.11816v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024 Camera-Ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11792">arXiv:2402.11792</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.11792">pdf</a>, <a href="https://arxiv.org/format/2402.11792">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SInViG: A Self-Evolving Interactive Visual Agent for Human-Robot Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jie Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanbo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xinghang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Huaping Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+T">Tao Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11792v2-abstract-short" style="display: inline;"> Linguistic ambiguity is ubiquitous in our daily lives. Previous works adopted interaction between robots and humans for language disambiguation. Nevertheless, when interactive robots are deployed in daily environments, there are significant challenges for natural human-robot interaction, stemming from complex and unpredictable visual inputs, open-ended interaction, and diverse user demands. In thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11792v2-abstract-full').style.display = 'inline'; document.getElementById('2402.11792v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11792v2-abstract-full" style="display: none;"> Linguistic ambiguity is ubiquitous in our daily lives. Previous works adopted interaction between robots and humans for language disambiguation. 
Nevertheless, when interactive robots are deployed in daily environments, there are significant challenges for natural human-robot interaction, stemming from complex and unpredictable visual inputs, open-ended interaction, and diverse user demands. In this paper, we present SInViG, which is a self-evolving interactive visual agent for human-robot interaction based on natural languages, aiming to resolve language ambiguity, if any, through multi-turn visual-language dialogues. It continuously and automatically learns from unlabeled images and large language models, without human intervention, to be more robust against visual and linguistic complexity. Benefiting from self-evolving, it sets new state-of-the-art on several interactive visual grounding benchmarks. Moreover, our human-robot interaction experiments show that the evolved models consistently acquire more and more preferences from human users. Besides, we also deployed our model on a Franka robot for interactive manipulation tasks. Results demonstrate that our model can follow diverse user instructions and interact naturally with humans in natural language, despite the complexity and disturbance of the environment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11792v2-abstract-full').style.display = 'none'; document.getElementById('2402.11792v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.03699">arXiv:2402.03699</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.03699">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Automatic Robotic Development through Collaborative Framework by Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luan%2C+Z">Zhirong Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+Y">Yujun Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Rundong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaruiqi Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liangjun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Badong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.03699v2-abstract-short" style="display: inline;"> Despite the remarkable code generation abilities of large language models (LLMs), they still face challenges in complex task handling. Robot development, a highly intricate field, inherently demands human involvement in task allocation and collaborative teamwork. To enhance robot development, we propose an innovative automated collaboration framework inspired by real-world robot developers. 
This fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03699v2-abstract-full').style.display = 'inline'; document.getElementById('2402.03699v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.03699v2-abstract-full" style="display: none;"> Despite the remarkable code generation abilities of large language models (LLMs), they still face challenges in complex task handling. Robot development, a highly intricate field, inherently demands human involvement in task allocation and collaborative teamwork. To enhance robot development, we propose an innovative automated collaboration framework inspired by real-world robot developers. This framework employs multiple LLMs in distinct roles: analysts, programmers, and testers. Analysts delve deep into user requirements, enabling programmers to produce precise code, while testers fine-tune the parameters based on user feedback for practical robot application. Each LLM tackles diverse, critical tasks within the development process. Clear collaboration rules emulate real-world teamwork among LLMs. Analysts, programmers, and testers form a cohesive team overseeing strategy, code, and parameter adjustments. Through this framework, we achieve complex robot development without requiring specialized knowledge, relying solely on non-experts' participation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03699v2-abstract-full').style.display = 'none'; document.getElementById('2402.03699v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.16699">arXiv:2401.16699</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.16699">pdf</a>, <a href="https://arxiv.org/format/2401.16699">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Towards Unified Interactive Visual Grounding in The Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jie Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hanbo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Si%2C+Q">Qingyi Si</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yifeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xuguang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+T">Tao Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.16699v2-abstract-short" style="display: inline;"> Interactive visual grounding in Human-Robot Interaction (HRI) is challenging yet practical due to the inevitable ambiguity in natural languages. It requires robots to disambiguate the user input by active information gathering. 
Previous approaches often rely on predefined templates to ask disambiguation questions, resulting in performance reduction in realistic interactive scenarios. In this paper&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16699v2-abstract-full').style.display = 'inline'; document.getElementById('2401.16699v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.16699v2-abstract-full" style="display: none;"> Interactive visual grounding in Human-Robot Interaction (HRI) is challenging yet practical due to the inevitable ambiguity in natural languages. It requires robots to disambiguate the user input by active information gathering. Previous approaches often rely on predefined templates to ask disambiguation questions, resulting in performance reduction in realistic interactive scenarios. In this paper, we propose TiO, an end-to-end system for interactive visual grounding in human-robot interaction. Benefiting from a unified formulation of visual dialogue and grounding, our method can be trained on a joint of extensive public data, and show superior generality to diversified and challenging open-world scenarios. In the experiments, we validate TiO on GuessWhat?! and InViG benchmarks, setting new state-of-the-art performance by a clear margin. Moreover, we conduct HRI experiments on the carefully selected 150 challenging scenes as well as real-robot platforms. Results show that our method demonstrates superior generality to diversified visual and language inputs with a high success rate. Codes and demos are available at https://github.com/jxu124/TiO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16699v2-abstract-full').style.display = 'none'; document.getElementById('2401.16699v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICRA 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.16355">arXiv:2401.16355</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.16355">pdf</a>, <a href="https://arxiv.org/format/2401.16355">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PathMMU: A Massive Multimodal Expert-Level Benchmark for Understanding and Reasoning in Pathology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yuxuan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+C">Chenglu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+S">Sunyi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qizi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yunlong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+D">Dan Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaoxiao Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+M">Mengyue Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingxiong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+X">Xinheng Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+T">Tao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Lin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.16355v3-abstract-short" style="display: inline;"> The emergence of large multimodal models has unlocked remarkable potential in AI, particularly in pathology. However, the lack of specialized, high-quality benchmark impeded their development and precise evaluation. To address this, we introduce PathMMU, the largest and highest-quality expert-validated pathology benchmark for Large Multimodal Models (LMMs). It comprises 33,428 multimodal multi-cho&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16355v3-abstract-full').style.display = 'inline'; document.getElementById('2401.16355v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.16355v3-abstract-full" style="display: none;"> The emergence of large multimodal models has unlocked remarkable potential in AI, particularly in pathology. However, the lack of specialized, high-quality benchmark impeded their development and precise evaluation. To address this, we introduce PathMMU, the largest and highest-quality expert-validated pathology benchmark for Large Multimodal Models (LMMs). It comprises 33,428 multimodal multi-choice questions and 24,067 images from various sources, each accompanied by an explanation for the correct answer. 
The construction of PathMMU harnesses GPT-4V&#39;s advanced capabilities, utilizing over 30,000 image-caption pairs to enrich captions and generate corresponding Q&amp;As in a cascading process. Significantly, to maximize PathMMU&#39;s authority, we invite seven pathologists to scrutinize each question under strict standards in PathMMU&#39;s validation and test sets, while simultaneously setting an expert-level performance benchmark for PathMMU. We conduct extensive evaluations, including zero-shot assessments of 14 open-sourced and 4 closed-sourced LMMs and their robustness to image corruption. We also fine-tune representative LMMs to assess their adaptability to PathMMU. The empirical findings indicate that advanced LMMs struggle with the challenging PathMMU benchmark, with the top-performing LMM, GPT-4V, achieving only a 49.8% zero-shot performance, significantly lower than the 71.8% demonstrated by human pathologists. After fine-tuning, significantly smaller open-sourced LMMs can outperform GPT-4V but still fall short of the expertise shown by pathologists. We hope that the PathMMU will offer valuable insights and foster the development of more specialized, next-generation LMMs for pathology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16355v3-abstract-full').style.display = 'none'; document.getElementById('2401.16355v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.11970">arXiv:2312.11970</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.11970">pdf</a>, <a href="https://arxiv.org/format/2312.11970">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models Empowered Agent-based Modeling and Simulation: A Survey and Perspectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaochong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+N">Nian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+J">Jingtao Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zhilun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+F">Fengli Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong Li</a> </p> <p class="abstract 
mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.11970v1-abstract-short" style="display: inline;"> Agent-based modeling and simulation has evolved as a powerful tool for modeling complex systems, offering insights into emergent behaviors and interactions among diverse agents. Integrating large language models into agent-based modeling and simulation presents a promising avenue for enhancing simulation capabilities. This paper surveys the landscape of utilizing large language models in agent-bas&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11970v1-abstract-full').style.display = 'inline'; document.getElementById('2312.11970v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.11970v1-abstract-full" style="display: none;"> Agent-based modeling and simulation has evolved as a powerful tool for modeling complex systems, offering insights into emergent behaviors and interactions among diverse agents. Integrating large language models into agent-based modeling and simulation presents a promising avenue for enhancing simulation capabilities. This paper surveys the landscape of utilizing large language models in agent-based modeling and simulation, examining their challenges and promising future directions. In this survey, since this is an interdisciplinary field, we first introduce the background of agent-based modeling and simulation and large language model-empowered agents. We then discuss the motivation for applying large language models to agent-based simulation and systematically analyze the challenges in environment perception, human alignment, action generation, and evaluation. Most importantly, we provide a comprehensive overview of the recent works of large language model-empowered agent-based modeling and simulation in multiple scenarios, which can be divided into four domains: cyber, physical, social, and hybrid, covering simulation of both real-world and virtual environments. Finally, since this area is new and quickly evolving, we discuss the open problems and promising future directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11970v1-abstract-full').style.display = 'none'; document.getElementById('2312.11970v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">37 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.10467">arXiv:2310.10467</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.10467">pdf</a>, <a href="https://arxiv.org/format/2310.10467">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Stance Detection with Collaborative Role-Infused LLM-Based Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaochong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+D">Depeng Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.10467v2-abstract-short" style="display: inline;"> Stance detection automatically detects the stance in a text towards a target, vital for content analysis in web and social media research. Despite their promising capabilities, LLMs encounter challenges when directly applied to stance detection. First, stance detection demands multi-aspect knowledge, from deciphering event-related terminologies to understanding the expression styles in social medi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.10467v2-abstract-full').style.display = 'inline'; document.getElementById('2310.10467v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.10467v2-abstract-full" style="display: none;"> Stance detection automatically detects the stance in a text towards a target, vital for content analysis in web and social media research. Despite their promising capabilities, LLMs encounter challenges when directly applied to stance detection. First, stance detection demands multi-aspect knowledge, from deciphering event-related terminologies to understanding the expression styles in social media platforms. Second, stance detection requires advanced reasoning to infer authors&#39; implicit viewpoints, as stance are often subtly embedded rather than overtly stated in the text. To address these challenges, we design a three-stage framework COLA (short for Collaborative rOle-infused LLM-based Agents) in which LLMs are designated distinct roles, creating a collaborative system where each role contributes uniquely. Initially, in the multidimensional text analysis stage, we configure the LLMs to act as a linguistic expert, a domain specialist, and a social media veteran to get a multifaceted analysis of texts, thus overcoming the first challenge. Next, in the reasoning-enhanced debating stage, for each potential stance, we designate a specific LLM-based agent to advocate for it, guiding the LLM to detect logical connections between text features and stance, tackling the second challenge. 
Finally, in the stance conclusion stage, a final decision maker agent consolidates prior insights to determine the stance. Our approach avoids extra annotated data and model training and is highly usable. We achieve state-of-the-art performance across multiple datasets. Ablation studies validate the effectiveness of each design role in handling stance detection. Further experiments have demonstrated the explainability and the versatility of our approach. Our approach excels in usability, accuracy, effectiveness, explainability and versatility, highlighting its value. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.10467v2-abstract-full').style.display = 'none'; document.getElementById('2310.10467v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05694">arXiv:2310.05694</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.05694">pdf</a>, <a href="https://arxiv.org/format/2310.05694">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Large Language Models for Healthcare: from Data, Technology, and Applications to Accountability and Ethics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+K">Kai He</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+R">Rui Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Q">Qika Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+Y">Yucheng Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+M">Mengling Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Cambria%2C+E">Erik Cambria</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05694v2-abstract-short" style="display: inline;"> The utilization of large language models (LLMs) in the Healthcare domain has generated both excitement and concern due to their ability to effectively respond to freetext queries with certain professional knowledge. This survey outlines the capabilities of the currently developed LLMs for Healthcare and explicates their development process, with the aim of providing an overview of the development&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05694v2-abstract-full').style.display = 'inline'; document.getElementById('2310.05694v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05694v2-abstract-full" style="display: none;"> The utilization of large language models (LLMs) in the Healthcare domain has generated both excitement and concern due to their ability to effectively respond to freetext queries with certain professional knowledge. 
This survey outlines the capabilities of the currently developed LLMs for Healthcare and explicates their development process, with the aim of providing an overview of the development roadmap from traditional Pretrained Language Models (PLMs) to LLMs. Specifically, we first explore the potential of LLMs to enhance the efficiency and effectiveness of various Healthcare applications, highlighting both the strengths and limitations. Secondly, we conduct a comparison between the previous PLMs and the latest LLMs, as well as comparing various LLMs with each other. Then we summarize related Healthcare training data, training methods, optimization strategies, and usage. Finally, the unique concerns associated with deploying LLMs in Healthcare settings are investigated, particularly regarding fairness, accountability, transparency and ethics. Our survey provides a comprehensive investigation from the perspectives of both computer science and the Healthcare specialty. Besides the discussion about Healthcare concerns, we support the computer science community by compiling a collection of open-source resources, such as accessible datasets, the latest methodologies, code implementations, and evaluation benchmarks on GitHub. In summary, we contend that a significant paradigm shift is underway, transitioning from PLMs to LLMs. This shift encompasses a move from discriminative AI approaches to generative AI approaches, as well as a shift from model-centered methodologies to data-centered methodologies. Also, we determine that the biggest obstacles to using LLMs in Healthcare are fairness, accountability, transparency, and ethics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05694v2-abstract-full').style.display = 'none'; document.getElementById('2310.05694v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.05938">arXiv:2308.05938</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.05938">pdf</a>, <a href="https://arxiv.org/format/2308.05938">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMM.2023.3330047">10.1109/TMM.2023.3330047 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> FoodSAM: Any Food Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xing Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+J">Jiayi Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hanyu Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+K">Kun Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+Z">Zehai Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+J">Jian Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.05938v1-abstract-short" style="display: inline;"> In this paper, we explore the zero-shot capability of the Segment Anything Model (SAM) for food image segmentation. To address the lack of class-specific information in SAM-generated masks, we propose a novel framework, called FoodSAM. This innovative approach integrates the coarse semantic mask with SAM-generated masks to enhance semantic segmentation quality. Besides, we recognize that the ingre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05938v1-abstract-full').style.display = 'inline'; document.getElementById('2308.05938v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.05938v1-abstract-full" style="display: none;"> In this paper, we explore the zero-shot capability of the Segment Anything Model (SAM) for food image segmentation. To address the lack of class-specific information in SAM-generated masks, we propose a novel framework, called FoodSAM. This innovative approach integrates the coarse semantic mask with SAM-generated masks to enhance semantic segmentation quality. Besides, we recognize that the ingredients in food can be supposed as independent individuals, which motivated us to perform instance segmentation on food images. Furthermore, FoodSAM extends its zero-shot capability to encompass panoptic segmentation by incorporating an object detector, which renders FoodSAM to effectively capture non-food object information. Drawing inspiration from the recent success of promptable segmentation, we also extend FoodSAM to promptable segmentation, supporting various prompt variants. 
Consequently, FoodSAM emerges as an all-encompassing solution capable of segmenting food items at multiple levels of granularity. Remarkably, this pioneering framework stands as the first-ever work to achieve instance, panoptic, and promptable segmentation on food images. Extensive experiments demonstrate the feasibility and impressive performance of FoodSAM, validating SAM&#39;s potential as a prominent and influential tool within the domain of food image segmentation. We release our code at https://github.com/jamesjg/FoodSAM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05938v1-abstract-full').style.display = 'none'; document.getElementById('2308.05938v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/jamesjg/FoodSAM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.02831">arXiv:2308.02831</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.02831">pdf</a>, <a href="https://arxiv.org/format/2308.02831">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Affective Visualization Design: Leveraging the Emotional Impact of Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xingyu Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yanqiu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+N">Nan Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.02831v2-abstract-short" style="display: inline;"> In recent years, more and more researchers have reflected on the undervaluation of emotion in data visualization and highlighted the importance of considering human emotion in visualization design. Meanwhile, an increasing number of studies have been conducted to explore emotion-related factors. However, so far, this research area is still in its early stages and faces a set of challenges, such as&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02831v2-abstract-full').style.display = 'inline'; document.getElementById('2308.02831v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.02831v2-abstract-full" style="display: none;"> In recent years, more and more researchers have reflected on the undervaluation of emotion in data visualization and highlighted the importance of considering human emotion in visualization design. Meanwhile, an increasing number of studies have been conducted to explore emotion-related factors. 
However, so far, this research area is still in its early stages and faces a set of challenges, such as the unclear definition of key concepts, the insufficient justification of why emotion is important in visualization design, and the lack of characterization of the design space of affective visualization design. To address these challenges, first, we conducted a literature review and identified three research lines that examined both emotion and data visualization. We clarified the differences between these research lines and kept 109 papers that studied or discussed how data visualization communicates and influences emotion. Then, we coded the 109 papers in terms of how they justified the legitimacy of considering emotion in visualization design (i.e., why emotion is important) and identified five argumentative perspectives. Based on these papers, we also identified 61 projects that practiced affective visualization design. We coded these design projects in three dimensions, including design fields (where), design tasks (what), and design methods (how), to explore the design space of affective visualization design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02831v2-abstract-full').style.display = 'none'; document.getElementById('2308.02831v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to appear at IEEE VIS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.16644">arXiv:2307.16644</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.16644">pdf</a>, <a href="https://arxiv.org/format/2307.16644">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3580305.3599874">10.1145/3580305.3599874 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> NEON: Living Needs Prediction System in Meituan </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lan%2C+X">Xiaochong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+S">Shiqi Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Che%2C+Y">Yingge Che</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+H">Huazhou Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+H">Hengliang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.16644v1-abstract-short" style="display: inline;"> Living needs refer to the various needs in human&#39;s daily lives for survival and well-being, including food, housing, entertainment, etc. On life service platforms that connect users to service providers, such as Meituan, the problem of living needs prediction is fundamental as it helps understand users and boost various downstream applications such as personalized recommendation. However, the prob&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.16644v1-abstract-full').style.display = 'inline'; document.getElementById('2307.16644v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.16644v1-abstract-full" style="display: none;"> Living needs refer to the various needs in human&#39;s daily lives for survival and well-being, including food, housing, entertainment, etc. On life service platforms that connect users to service providers, such as Meituan, the problem of living needs prediction is fundamental as it helps understand users and boost various downstream applications such as personalized recommendation. However, the problem has not been well explored and is faced with two critical challenges. First, the needs are naturally connected to specific locations and times, suffering from complex impacts from the spatiotemporal context. Second, there is a significant gap between users&#39; actual living needs and their historical records on the platform. To address these two challenges, we design a system of living NEeds predictiON named NEON, consisting of three phases: feature mining, feature fusion, and multi-task prediction. In the feature mining phase, we carefully extract individual-level user features for spatiotemporal modeling, and aggregated-level behavioral features for enriching data, which serve as the basis for addressing two challenges, respectively. Further, in the feature fusion phase, we propose a neural network that effectively fuses two parts of features into the user representation. Moreover, we design a multi-task prediction phase, where the auxiliary task of needs-meeting way prediction can enhance the modeling of spatiotemporal context. Extensive offline evaluations verify that our NEON system can effectively predict users&#39; living needs. Furthermore, we deploy NEON into Meituan&#39;s algorithm engine and evaluate how it enhances the three downstream prediction applications, via large-scale online A/B testing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.16644v1-abstract-full').style.display = 'none'; document.getElementById('2307.16644v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. 

arXiv:2307.14984 [pdf, other] | cs.SI (Social and Information Networks)
S3: Social-network Simulation System with Large Language Model-Empowered Agents
Authors: Chen Gao, Xiaochong Lan, Zhihong Lu, Jinzhu Mao, Jinghua Piao, Huandong Wang, Depeng Jin, Yong Li
Abstract: Social network simulation plays a crucial role in addressing various challenges within social science. It offers extensive applications such as state prediction, phenomena explanation, and policy-making support, among others. In this work, we harness the human-like capabilities exhibited by large language models (LLMs) in sensing, reasoning, and behaving, and utilize these qualities to construct the S^3 system (short for Social network Simulation System). Adhering to the widely employed agent-based simulation paradigm, we employ prompt engineering and prompt tuning techniques to ensure that the agent's behavior closely emulates that of a genuine human within the social network. Specifically, we simulate three pivotal aspects: emotion, attitude, and interaction behaviors. By endowing the agents in the system with the ability to perceive the informational environment and emulate human actions, we observe the emergence of population-level phenomena, including the propagation of information, attitudes, and emotions. We conduct an evaluation encompassing two levels of simulation, employing real-world social network data. Encouragingly, the results demonstrate promising accuracy. This work represents an initial step in the realm of social network simulation empowered by LLM-based agents. We anticipate that our endeavors will serve as a source of inspiration for the development of simulation systems within, but not limited to, social science.
Submitted 19 October, 2023; v1 submitted 27 July, 2023; originally announced July 2023.
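
The S^3 abstract describes prompting an LLM so that each agent reports emotion, attitude, and an interaction behavior at every simulation step. A minimal sketch of one such step is shown below; call_llm, the persona format, and the reply format are hypothetical placeholders, not the paper's prompts or APIs:

```python
def call_llm(prompt: str) -> str:
    # Placeholder: plug in any chat-completion client here.
    raise NotImplementedError

def agent_step(persona: str, observed_posts: list[str]) -> dict:
    """One simulation tick for a single agent: prompt the LLM with the agent's
    persona and its current information environment, then parse the reported
    emotion, attitude, and interaction behavior."""
    posts = "\n- ".join(observed_posts)
    prompt = (
        f"You are {persona} on a social network.\n"
        f"You just saw these posts:\n- {posts}\n"
        "Reply with three lines: EMOTION: ..., ATTITUDE: ..., ACTION: repost, comment, or ignore."
    )
    out = {}
    for line in call_llm(prompt).splitlines():
        key, _, val = line.partition(":")
        if val:
            out[key.strip().lower()] = val.strip()
    return out
```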

arXiv:2307.11458 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
Strip-MLP: Efficient Token Interaction for Vision MLP
Authors: Guiping Cao, Shengda Luo, Wenjian Huang, Xiangyuan Lan, Dongmei Jiang, Yaowei Wang, Jianguo Zhang
Abstract: Token interaction operation is one of the core modules in MLP-based models to exchange and aggregate information between different spatial locations. However, the power of token interaction on the spatial dimension is highly dependent on the spatial resolution of the feature maps, which limits the model's expressive ability, especially in deep layers where the features are down-sampled to a small spatial size. To address this issue, we present a novel method called Strip-MLP to enrich the token interaction power in three ways. Firstly, we introduce a new MLP paradigm called Strip MLP layer that allows tokens to interact with other tokens in a cross-strip manner, enabling the tokens in a row (or column) to contribute to the information aggregations in adjacent but different strips of rows (or columns). Secondly, a Cascade Group Strip Mixing Module (CGSMM) is proposed to overcome the performance degradation caused by small spatial feature size. The module allows tokens to interact more effectively in the manners of within-patch and cross-patch, which is independent of the feature spatial size. Finally, based on the Strip MLP layer, we propose a novel Local Strip Mixing Module (LSMM) to boost the token interaction power in the local region. Extensive experiments demonstrate that Strip-MLP significantly improves the performance of MLP-based models on small datasets and obtains comparable or even better results on ImageNet. In particular, Strip-MLP models achieve higher average Top-1 accuracy than existing MLP-based models by +2.44% on Caltech-101 and +2.16% on CIFAR-100. The source codes will be available at https://github.com/Med-Process/Strip_MLP.
Submitted 21 July, 2023; originally announced July 2023.
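
For intuition only, a toy strip-wise token-mixing block is sketched below. It mixes tokens along rows and along columns with small linear maps so that spatial interaction does not hinge on mixing the full H x W map at once; it is not the paper's Strip MLP layer, CGSMM, or LSMM, and all shapes and names are assumptions:

```python
import torch
import torch.nn as nn

class StripMixSketch(nn.Module):
    """Illustrative strip-wise token mixing: tokens are mixed along each row
    strip and each column strip with shared linear maps, then fused channel-wise."""
    def __init__(self, h: int, w: int, channels: int):
        super().__init__()
        self.row_mix = nn.Linear(w, w)   # mixes tokens within each row strip
        self.col_mix = nn.Linear(h, h)   # mixes tokens within each column strip
        self.proj = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, x):                # x: (B, C, H, W)
        x = x + self.row_mix(x)          # mix along the width dimension
        x = x + self.col_mix(x.transpose(-1, -2)).transpose(-1, -2)  # mix along height
        return self.proj(x)

x = torch.randn(2, 32, 14, 14)
y = StripMixSketch(14, 14, 32)(x)        # output keeps the input shape
```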

arXiv:2307.09193 [pdf, other] | cs.AI (Artificial Intelligence); cs.IR (Information Retrieval)
ESMC: Entire Space Multi-Task Model for Post-Click Conversion Rate via Parameter Constraint
Authors: Zhenhao Jiang, Biao Zeng, Hao Feng, Jin Liu, Jicong Fan, Jie Zhang, Jia Jia, Ning Hu, Xingyu Chen, Xuguang Lan
Abstract: Large-scale online recommender systems spread all over the Internet and are in charge of two basic
tasks: Click-Through Rate (CTR) and Post-Click Conversion Rate (CVR) estimation. However, traditional CVR estimators suffer from well-known Sample Selection Bias and Data Sparsity issues. Entire space models were proposed to address the two issues via tracing the decision-making path of "exposure_click_purchase". Further, some researchers observed that there are purchase-related behaviors between click and purchase, which can better reveal the user's decision-making intention and improve the recommendation performance. Thus, the decision-making path has been extended to "exposure_click_in-shop action_purchase" and can be modeled with a conditional probability approach. Nevertheless, we observe that the chain rule of conditional probability does not always hold. We report the Probability Space Confusion (PSC) issue and give a mathematical derivation of the difference between the ground truth and the estimation. We propose a novel Entire Space Multi-Task Model for Post-Click Conversion Rate via Parameter Constraint (ESMC) and two alternatives: Entire Space Multi-Task Model with Siamese Network (ESMS) and Entire Space Multi-Task Model in Global Domain (ESMG) to address the PSC issue. Specifically, we handle "exposure_click_in-shop action" and "in-shop action_purchase" separately in light of the characteristics of in-shop actions. The first path is still treated with conditional probability while the second one is treated with a parameter constraint strategy. Experiments on both offline and online environments in a large-scale recommendation system illustrate the superiority of our proposed methods over state-of-the-art models. The real-world datasets will be released.
Submitted 29 July, 2023; v1 submitted 18 July, 2023; originally announced July 2023.
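
As background for the entire-space formulation this abstract builds on, here is a toy two-tower sketch for the first path (exposure to click to in-shop action) modeled with conditional probabilities; the paper's parameter-constraint treatment of the second path is not reproduced, and all dimensions and names are assumptions:

```python
import torch
import torch.nn as nn

class EntireSpaceSketch(nn.Module):
    """Toy illustration: one tower predicts p(click | exposure), another predicts
    p(in-shop action | click); their product is supervised over the entire
    exposure space, which is the conditional-probability trick used by
    entire-space models for the first path."""
    def __init__(self, d_in=32, d_hidden=64):
        super().__init__()
        def tower():
            return nn.Sequential(nn.Linear(d_in, d_hidden), nn.ReLU(),
                                 nn.Linear(d_hidden, 1), nn.Sigmoid())
        self.ctr_tower = tower()     # p(click | exposure)
        self.cavr_tower = tower()    # p(in-shop action | click)

    def forward(self, x):
        p_click = self.ctr_tower(x)
        p_click_action = p_click * self.cavr_tower(x)   # entire-space quantity
        return p_click, p_click_action

model = EntireSpaceSketch()
p_click, p_click_action = model(torch.randn(16, 32))
# Both outputs can be trained with BCE against exposure-level labels, e.g.
# loss = bce(p_click, click_label) + bce(p_click_action, action_label)
```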

arXiv:2304.12592 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
MMRDN: Consistent Representation for Multi-View Manipulation Relationship Detection in Object-Stacked Scenes
Authors: Han Wang, Jiayuan Zhang, Lipeng Wan, Xingyu Chen, Xuguang Lan, Nanning Zheng
Abstract: Manipulation relationship detection (MRD) aims to guide the robot to grasp objects in the right order, which is important to ensure the safety and reliability of grasping in object-stacked scenes. Previous works infer manipulation relationships with deep neural networks trained on data collected from a predefined view, which are limited by visual dislocation in unstructured environments. Multi-view data provide more comprehensive information in space, while a challenge of multi-view MRD is domain shift. In this paper, we propose a novel multi-view fusion framework, namely the multi-view MRD network (MMRDN), which is trained by 2D and 3D multi-view data. We project the 2D data from different views into a common hidden space and fit the embeddings with a set of von Mises-Fisher distributions to learn consistent representations. Besides, taking advantage of position information within the 3D data, we select a set of K Maximum Vertical Neighbors (KMVN) points from the point cloud of each object pair, which encodes the relative position of these two objects. Finally, the features of multi-view 2D and 3D data are concatenated to predict the pairwise relationship of objects. Experimental results on the challenging REGRAD dataset show that MMRDN outperforms the state-of-the-art methods in multi-view MRD tasks. The results also demonstrate that our model trained on synthetic data is capable of transferring to real-world scenarios.
Submitted 25 April, 2023; originally announced April 2023.
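
One way to read "fit the embeddings with a set of von Mises-Fisher distributions to learn consistent representations" is as pulling per-view embeddings of the same object toward a shared mean direction on the unit sphere. The sketch below illustrates that reading only; it is an assumption, not the paper's loss, and the shapes are illustrative:

```python
import torch
import torch.nn.functional as F

def vmf_consistency_loss(view_embeddings: torch.Tensor) -> torch.Tensor:
    """Illustrative consistency objective in the spirit of a von Mises-Fisher fit:
    embeddings of one object from different views are L2-normalised onto the unit
    sphere and pulled toward their shared mean direction (the distribution's mode).

    view_embeddings: (num_views, dim) embeddings of a single object."""
    z = F.normalize(view_embeddings, dim=-1)    # points on the unit sphere
    mu = F.normalize(z.mean(dim=0), dim=0)      # estimated mean direction
    return (1.0 - z @ mu).mean()                # small when all views agree

loss = vmf_consistency_loss(torch.randn(4, 128))
```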

arXiv:2304.01171 [pdf, other] | cs.CV (Computer Vision and Pattern Recognition)
Revisiting Context Aggregation for Image Matting
Authors: Qinglin Liu, Xiaoqian Lv, Quanling Meng, Zonglin Li, Xiangyuan Lan, Shuo Yang, Shengping Zhang, Liqiang Nie
Abstract: Traditional studies emphasize the significance of context information in improving matting performance. Consequently, deep learning-based matting methods delve into designing pooling or affinity-based context aggregation modules to achieve superior results. However, these modules cannot properly handle the context scale shift caused by the difference in image size during training and inference, resulting in matting performance degradation.
In this paper, we revisit the context aggregation mechanisms of matting networks and find that a basic encoder-decoder network without any context aggregation modules can actually learn more universal context aggregation, thereby achieving higher matting performance compared to existing methods. Building on this insight, we present AEMatter, a matting network that is straightforward yet very effective. AEMatter adopts a Hybrid-Transformer backbone with appearance-enhanced axis-wise learning (AEAL) blocks to build a basic network with strong context aggregation learning capability. Furthermore, AEMatter leverages a large image training strategy to assist the network in learning context aggregation from data. Extensive experiments on five popular matting datasets demonstrate that the proposed AEMatter outperforms state-of-the-art matting methods by a large margin.
Submitted 14 May, 2024; v1 submitted 3 April, 2023; originally announced April 2023.

arXiv:2303.17408 [pdf, other] | cs.CL (Computation and Language)
P-Transformer: A Prompt-based Multimodal Transformer Architecture For Medical Tabular Data
Authors: Yucheng Ruan, Xiang Lan, Daniel J. Tan, Hairil Rizal Abdullah, Mengling Feng
Abstract: Medical tabular data, abundant in Electronic Health Records (EHRs), is a valuable resource for diverse medical tasks such as risk prediction.
While deep learning approaches, particularly transformer-based models, have shown remarkable performance in tabular data prediction, several problems remain in adapting existing work effectively to the medical domain, such as under-utilization of unstructured free-texts, limited exploration of textual information in structured data, and data corruption. To address these issues, we propose P-Transformer, a Prompt-based multimodal Transformer architecture designed specifically for medical tabular data. This framework consists of two critical components: a tabular cell embedding generator and a tabular transformer. The former efficiently encodes diverse modalities from both structured and unstructured tabular data into a harmonized language semantic space with the help of a pre-trained sentence encoder and medical prompts. The latter integrates cell representations to generate patient embeddings for various medical tasks. In comprehensive experiments on two real-world datasets for three medical tasks, P-Transformer demonstrated improvements of 10.9%/11.0% on RMSE/MAE, 0.5%/2.2% on RMSE/MAE, and 1.6%/0.8% on BACC/AUROC in predictability compared to state-of-the-art (SOTA) baselines. Notably, the model exhibited strong resilience to data corruption in the structured data, particularly when the corruption rates are high.
Submitted 9 January, 2024; v1 submitted 30 March, 2023; originally announced March 2023.
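
The two-component design (cell embedding generator plus tabular transformer) can be pictured with the toy sketch below. The prompt template, the placeholder embed_sentences function, and the mean pooling are assumptions; the paper's medical prompts and its actual encoder are not shown here:

```python
import torch
import torch.nn as nn

def embed_sentences(sentences):               # placeholder: any pre-trained
    return torch.randn(len(sentences), 384)   # sentence encoder could plug in here

def cells_to_prompts(record: dict) -> list[str]:
    """Verbalise each tabular cell with a simple prompt template (the exact
    medical prompts are not given in this listing)."""
    return [f"The {col} of the patient is {val}." for col, val in record.items()]

class PatientEncoderSketch(nn.Module):
    """Toy version of the two-stage idea: sentence-embed every cell, then let a
    transformer aggregate the cell embeddings into one patient embedding."""
    def __init__(self, d_model=384, n_heads=4, n_layers=2):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)

    def forward(self, cell_embeddings):               # (batch, n_cells, d_model)
        return self.encoder(cell_embeddings).mean(dim=1)   # patient embedding

record = {"age": 67, "diagnosis": "type 2 diabetes", "note": "stable on metformin"}
cells = embed_sentences(cells_to_prompts(record)).unsqueeze(0)
patient_vec = PatientEncoderSketch()(cells)
```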

arXiv:2303.07828 [pdf, other] | cs.RO (Robotics)
Prioritized Planning for Target-Oriented Manipulation via Hierarchical Stacking Relationship Prediction
Authors: Zewen Wu, Jian Tang, Xingyu Chen, Chengzhong Ma, Xuguang Lan, Nanning Zheng
Abstract: In scenarios involving the grasping of multiple targets, the learning of stacking relationships between objects is fundamental for robots to execute safely and efficiently. However, current methods lack subdivision for the hierarchy of stacking relationship types. In scenes where objects are mostly stacked in an orderly manner, they are incapable of making human-like, highly efficient grasping decisions. This paper proposes a perception-planning method to distinguish different stacking types between objects and generate prioritized manipulation order decisions based on given target designations. We utilize a Hierarchical Stacking Relationship Network (HSRN) to discriminate the hierarchy of stacking and generate a refined Stacking Relationship Tree (SRT) for relationship description. Considering that objects with high stacking stability can be grasped together if necessary, we introduce an elaborate decision-making planner based on the Partially Observable Markov Decision Process (POMDP), which leverages observations, generates a robust decision chain with the fewest grasps, and is suitable for simultaneously specifying multiple targets. To verify our work, we set the scene to the dining table and augment the REGRAD dataset with a set of common tableware models for network training.
Experiments show that our method effectively generates grasping decisions that conform to human requirements, and improves execution efficiency compared with existing methods while guaranteeing the success rate.
Submitted 25 June, 2023; v1 submitted 14 March, 2023; originally announced March 2023.
Comments: 8 pages, 8 figures. Accepted by 2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2023)

arXiv:2302.03357 [pdf, other] | cs.LG (Machine Learning)
Towards Enhancing Time Series Contrastive Learning: A Dynamic Bad Pair Mining Approach
Authors: Xiang Lan, Hanshu Yan, Shenda Hong, Mengling Feng
Abstract: Not all positive pairs are beneficial to time series contrastive learning. In this paper, we study two types of bad positive pairs that can impair the quality of time series representations learned through contrastive learning: the noisy positive pair and the faulty positive pair. We observe that, with the presence of noisy positive pairs, the model tends to simply learn the pattern of noise (Noisy Alignment). Meanwhile, when faulty positive pairs arise, the model wastes a considerable amount of effort aligning non-representative patterns (Faulty Alignment). To address this problem, we propose a Dynamic Bad Pair Mining (DBPM) algorithm, which reliably identifies and suppresses bad positive pairs in time series contrastive learning. Specifically, DBPM utilizes a memory module to dynamically track the training behavior of each positive pair along the training process. This allows us to identify potential bad positive pairs at each epoch based on their historical training behaviors. The identified bad pairs are subsequently down-weighted through a transformation module, thereby mitigating their negative impact on the representation learning process. DBPM is a simple algorithm designed as a lightweight plug-in without learnable parameters to enhance the performance of existing state-of-the-art methods. Through extensive experiments conducted on four large-scale, real-world time series datasets, we demonstrate DBPM's efficacy in mitigating the adverse effects of bad positive pairs.
Submitted 28 March, 2024; v1 submitted 7 February, 2023; originally announced February 2023.
Comments: ICLR 2024 Camera Ready (https://openreview.net/pdf?id=K2c04ulKXn)
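
DBPM is described as a parameter-free plug-in that tracks each positive pair's training behavior and down-weights suspected bad pairs. The sketch below conveys that idea with a running-mean loss memory and a z-score rule; the rule, the threshold, and the 0.1 weight are assumptions, not the paper's exact mechanism:

```python
import numpy as np

class BadPairMinerSketch:
    """Minimal sketch of dynamic bad-pair mining: keep a running memory of each
    positive pair's alignment loss, flag pairs whose historical loss deviates
    strongly from the population at this epoch, and down-weight them."""
    def __init__(self, num_pairs: int, down_weight: float = 0.1, z_thresh: float = 2.0):
        self.history = np.zeros(num_pairs)   # running mean loss per pair
        self.seen = np.zeros(num_pairs)      # number of epochs observed per pair
        self.down_weight = down_weight
        self.z_thresh = z_thresh

    def weights(self, pair_ids: np.ndarray, losses: np.ndarray) -> np.ndarray:
        # update the per-pair memory with this epoch's losses
        self.seen[pair_ids] += 1
        self.history[pair_ids] += (losses - self.history[pair_ids]) / self.seen[pair_ids]
        # flag pairs whose historical behavior is an outlier within the batch
        mu = self.history[pair_ids].mean()
        sigma = self.history[pair_ids].std() + 1e-8
        z = np.abs(self.history[pair_ids] - mu) / sigma
        return np.where(z > self.z_thresh, self.down_weight, 1.0)

miner = BadPairMinerSketch(num_pairs=1000)
w = miner.weights(np.arange(32), np.random.rand(32))  # multiply into the pair losses
```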

arXiv:2211.12075 [pdf, other] | cs.MA (Multiagent Systems); cs.LG (Machine Learning)
Greedy based Value Representation for Optimal Coordination in Multi-agent Reinforcement Learning
Authors: Lipeng Wan, Zeyang Liu, Xingyu Chen, Xuguang Lan, Nanning Zheng
Abstract: Due to the representation limitation of the joint Q value function, multi-agent reinforcement learning methods with linear value decomposition (LVD) or monotonic value decomposition (MVD) suffer from relative overgeneralization. As a result, they cannot ensure optimal consistency (i.e., the correspondence between individual greedy actions and the maximal true Q value). In this paper, we derive the expression of the joint Q value function of LVD and MVD. According to the expression, we draw a transition diagram, where each self-transition node (STN) is a possible convergence point. To ensure optimal consistency, the optimal node is required to be the unique STN. Therefore, we propose the greedy-based value representation (GVR), which turns the optimal node into an STN via inferior target shaping and further eliminates the non-optimal STNs via superior experience replay. In addition, GVR achieves an adaptive trade-off between optimality and stability. Our method outperforms state-of-the-art baselines in experiments on various benchmarks. Theoretical proofs and empirical results on matrix games demonstrate that GVR ensures optimal consistency under sufficient exploration.
Submitted 22 November, 2022; originally announced November 2022.
Comments: arXiv admin note: substantial text overlap with arXiv:2112.04454
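
The failure mode motivating GVR (loss of optimal consistency under linear value decomposition) is easy to reproduce on a one-step matrix game. The payoff matrix and the least-squares stand-in for training below are illustrative assumptions, not the paper's setup:

```python
import numpy as np

# One-step 2-agent matrix game (rows: agent 1's action, cols: agent 2's action).
# A relative-overgeneralization style payoff: the best joint action is risky.
payoff = np.array([[  8., -12., -12.],
                   [-12.,   0.,   0.],
                   [-12.,   0.,   0.]])

# Linear value decomposition (LVD): Q_tot(a1, a2) ~ q1[a1] + q2[a2].
# Fit q1, q2 by least squares over all joint actions (a stand-in for training).
n = payoff.shape[0]
A = np.zeros((n * n, 2 * n))
for a1 in range(n):
    for a2 in range(n):
        A[a1 * n + a2, a1] = 1.0
        A[a1 * n + a2, n + a2] = 1.0
theta, *_ = np.linalg.lstsq(A, payoff.reshape(-1), rcond=None)
q1, q2 = theta[:n], theta[n:]

greedy = (int(q1.argmax()), int(q2.argmax()))
optimal = np.unravel_index(payoff.argmax(), payoff.shape)
print(greedy, optimal)  # the LVD greedy joint action need not match the true optimum
```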

arXiv:2211.03296 [pdf, other] | cs.HC (Human-Computer Interaction)
The Chart Excites Me! Exploring How Data Visualization Design Influences Affective Arousal
Authors: Xingyu Lan, Yanqiu Wu, Qing Chen, Nan Cao
Abstract: As data visualizations have been increasingly applied in mass communication, designers often seek to grasp viewers immediately and motivate them to read more. Such goals, as suggested by previous research, are closely associated with the activation of emotion, namely affective arousal. Given this motivation, this work takes initial steps toward understanding the arousal-related factors in data visualization design. We collected a corpus of 265 data visualizations and conducted a crowdsourcing study with 184 participants during which the participants were asked to rate the affective arousal elicited by data visualization design (all texts were blurred to exclude the influence of semantics) and provide their reasons. Based on the collected data, first, we identified a set of arousal-related design features by analyzing user comments qualitatively. Then, we mapped these features to computable variables and constructed regression models to infer which features are significant contributors to affective arousal quantitatively. Through this exploratory study, we finally identified four design features (e.g., colorfulness, the number of different visual channels) cross-validated as important features correlated with affective arousal.
Submitted 6 November, 2022; originally announced November 2022.
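
The quantitative step described above (mapping design features to computable variables and regressing arousal ratings on them) can be illustrated as follows; the feature names and the synthetic data are assumptions, and this is not the study's actual analysis:

```python
import numpy as np
import statsmodels.api as sm

# Illustrative only: regress crowd-rated arousal on computable design features.
rng = np.random.default_rng(0)
n = 184
X = np.column_stack([
    rng.uniform(0, 1, n),    # colorfulness
    rng.integers(1, 6, n),   # number of different visual channels
    rng.uniform(0, 1, n),    # a third hypothetical feature
])
y = 0.8 * X[:, 0] + 0.3 * X[:, 1] + rng.normal(0, 0.5, n)  # synthetic ratings

model = sm.OLS(y, sm.add_constant(X)).fit()
print(model.params)    # which features contribute to arousal
print(model.pvalues)   # and whether each contribution is significant
```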