Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 61 results for author: <span class="mathjax">Shu, T</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Shu%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Shu, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shu%2C+T&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shu, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Shu%2C+T&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Shu%2C+T&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shu%2C+T&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11844">arXiv:2411.11844</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11844">pdf</a>, <a href="https://arxiv.org/format/2411.11844">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Generative World Explorer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+T">Taiming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuille%2C+A">Alan Yuille</a>, <a href="/search/cs?searchtype=author&amp;query=Khashabi%2C+D">Daniel Khashabi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jieneng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11844v2-abstract-short" style="display: inline;"> Planning with partial observation is a central challenge in embodied AI. A majority of prior works have tackled this challenge by developing agents that physically explore their environment to update their beliefs about the world state. In contrast, humans can $\textit{imagine}$ unseen parts of the world through a mental exploration and $\textit{revise}$ their beliefs with imagined observations. S&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11844v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11844v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11844v2-abstract-full" style="display: none;"> Planning with partial observation is a central challenge in embodied AI. 
A majority of prior works have tackled this challenge by developing agents that physically explore their environment to update their beliefs about the world state. In contrast, humans can $\textit{imagine}$ unseen parts of the world through a mental exploration and $\textit{revise}$ their beliefs with imagined observations. Such updated beliefs can allow them to make more informed decisions, without necessitating the physical exploration of the world at all times. To achieve this human-like ability, we introduce the $\textit{Generative World Explorer (Genex)}$, an egocentric world exploration framework that allows an agent to mentally explore a large-scale 3D world (e.g., urban scenes) and acquire imagined observations to update its belief. This updated belief will then help the agent to make a more informed decision at the current step. To train $\textit{Genex}$, we create a synthetic urban scene dataset, Genex-DB. Our experimental results demonstrate that (1) $\textit{Genex}$ can generate high-quality and consistent observations during long-horizon exploration of a large virtual physical world and (2) the beliefs updated with the generated observations can inform an existing decision-making model (e.g., an LLM agent) to make better plans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11844v2-abstract-full').style.display = 'none'; document.getElementById('2411.11844v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website: generative-world-explorer.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04987">arXiv:2411.04987</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04987">pdf</a>, <a href="https://arxiv.org/format/2411.04987">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Few-Shot Task Learning through Inverse Generative Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Netanyahu%2C+A">Aviv Netanyahu</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Y">Yilun Du</a>, <a href="/search/cs?searchtype=author&amp;query=Bronars%2C+A">Antonia Bronars</a>, <a href="/search/cs?searchtype=author&amp;query=Pari%2C+J">Jyothish Pari</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J">Joshua Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Agrawal%2C+P">Pulkit Agrawal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04987v1-abstract-short" style="display: inline;"> Learning the intents of an agent, defined by its goals or motion style, is often extremely challenging from just a few examples. We refer to this problem as task concept learning and present our approach, Few-Shot Task Learning through Inverse Generative Modeling (FTL-IGM), which learns new task concepts by leveraging invertible neural generative models. The core idea is to pretrain a generative m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04987v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04987v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04987v1-abstract-full" style="display: none;"> Learning the intents of an agent, defined by its goals or motion style, is often extremely challenging from just a few examples. We refer to this problem as task concept learning and present our approach, Few-Shot Task Learning through Inverse Generative Modeling (FTL-IGM), which learns new task concepts by leveraging invertible neural generative models. The core idea is to pretrain a generative model on a set of basic concepts and their demonstrations. Then, given a few demonstrations of a new concept (such as a new goal or a new action), our method learns the underlying concepts through backpropagation without updating the model weights, thanks to the invertibility of the generative model. We evaluate our method in five domains -- object rearrangement, goal-oriented navigation, motion caption of human actions, autonomous driving, and real-world table-top manipulation. 
Our experimental results demonstrate that via the pretrained generative model, we successfully learn novel concepts and generate agent plans or motion corresponding to these concepts in (1) unseen environments and (2) in composition with training concepts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04987v1-abstract-full').style.display = 'none'; document.getElementById('2411.04987v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01796">arXiv:2411.01796</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01796">pdf</a>, <a href="https://arxiv.org/format/2411.01796">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Constrained Human-AI Cooperation: An Inclusive Embodied Social Intelligence Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Du%2C+W">Weihua Du</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Q">Qiushi Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+J">Jiaming Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+Z">Zhenting Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Sunli Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+A">Andi Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+K">Kwonjoon Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Dariush%2C+B">Behzad Dariush</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+C">Chuang Gan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01796v2-abstract-short" style="display: inline;"> We introduce Constrained Human-AI Cooperation (CHAIC), an inclusive embodied social intelligence challenge designed to test social perception and cooperation in embodied agents. 
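
Illustrative sketch (not from the paper): the mechanism this abstract describes -- recovering a new concept embedding by backpropagating through a frozen pretrained generative model -- might look as follows in PyTorch. The toy decoder, dimensions, and reconstruction loss are illustrative assumptions, not the paper's implementation:

    import torch
    import torch.nn as nn

    # Stand-in for a pretrained generative model: maps a concept embedding
    # to a demonstration (here just a flat vector). Weights stay frozen.
    decoder = nn.Sequential(nn.Linear(8, 64), nn.Tanh(), nn.Linear(64, 32))
    for p in decoder.parameters():
        p.requires_grad_(False)

    demos = torch.randn(5, 32)                     # a few demos of a new concept
    concept = torch.zeros(8, requires_grad=True)   # the only trainable quantity

    opt = torch.optim.Adam([concept], lr=1e-2)
    for _ in range(500):
        recon = decoder(concept).expand_as(demos)
        loss = ((recon - demos) ** 2).mean()       # fit the demos via the embedding
        opt.zero_grad()
        loss.backward()                            # gradients flow through the
        opt.step()                                 # frozen model to the concept
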
3. arXiv:2411.01796 [pdf, other] · cs.AI, cs.HC, cs.RO
   Constrained Human-AI Cooperation: An Inclusive Embodied Social Intelligence Challenge
   Authors: Weihua Du, Qiushi Lyu, Jiaming Shan, Zhenting Qi, Hongxin Zhang, Sunli Chen, Andi Peng, Tianmin Shu, Kwonjoon Lee, Behzad Dariush, Chuang Gan
   Abstract: We introduce Constrained Human-AI Cooperation (CHAIC), an inclusive embodied social intelligence challenge designed to test social perception and cooperation in embodied agents. In CHAIC, the goal is for an embodied agent equipped with egocentric observations to assist a human who may be operating under physical constraints -- e.g., unable to reach high places or confined to a wheelchair -- in performing common household or outdoor tasks as efficiently as possible. To achieve this, a successful helper must (1) infer the human's intents and constraints by following the human and observing their behaviors (social perception), and (2) make a cooperative plan tailored to the human partner so the team solves the task as quickly as possible (cooperative planning). To benchmark this challenge, we create four new agents with real physical constraints and eight long-horizon tasks featuring both indoor and outdoor scenes with various constraints, emergency events, and potential risks. We benchmark planning- and learning-based baselines on the challenge and introduce a new method that leverages large language models and behavior modeling. Empirical evaluations demonstrate the effectiveness of our benchmark in enabling systematic assessment of key aspects of machine social intelligence. Our benchmark and code are publicly available at https://github.com/UMass-Foundation-Model/CHAIC.
   Submitted 4 November, 2024; v1 submitted 3 November, 2024; originally announced November 2024.
   Comments: NeurIPS 2024 Datasets and Benchmarks Track. The first two authors contributed equally. Project website: https://vis-www.cs.umass.edu/CHAIC/

4. arXiv:2410.10260 [pdf, other] · cs.CV
   Slide-based Graph Collaborative Training for Histopathology Whole Slide Image Analysis
   Authors: Jun Shi, Tong Shu, Zhiguo Jiang, Wei Wang, Haibo Wu, Yushan Zheng
   Abstract: Computational pathology rests on the consensus that the pathological characteristics of tumors provide significant guidance for cancer diagnostics. Most existing research focuses on the inner-contextual information within each whole slide image (WSI) yet ignores possible inter-correlations between slides. Because tumor development is a continuous process involving a series of histological, morphological, and genetic changes that accumulate over time, the similarities and differences between WSIs across stages, grades, locations, and patients should potentially contribute to WSI representation and deserve to be taken into account in WSI modeling. To verify the benefit of introducing slide inter-correlations into WSI representation learning, we propose a generic WSI analysis pipeline, SlideGCD, that can be adapted to any existing multiple instance learning (MIL) framework to improve its performance. With this new paradigm, prior knowledge of cancer development participates in the end-to-end workflow, which concurrently initializes and refines slide representations as a guide for message passing in the slide-based graph. Extensive comparisons and experiments validate the effectiveness and robustness of the proposed pipeline across four tasks -- cancer subtyping, cancer staging, survival prediction, and gene mutation prediction -- with seven representative state-of-the-art WSI analysis frameworks as backbones.
   Submitted 14 October, 2024; originally announced October 2024.

5. arXiv:2409.10849 [pdf, other] · cs.RO, cs.AI, cs.HC, cs.MA
   SIFToM: Robust Spoken Instruction Following through Theory of Mind
   Authors: Lance Ying, Jason Xinyu Liu, Shivam Aarya, Yizirui Fang, Stefanie Tellex, Joshua B. Tenenbaum, Tianmin Shu
   Abstract: Spoken language instructions are ubiquitous in agent collaboration. However, in human-robot collaboration, recognition accuracy for human speech is often influenced by various speech and environmental factors, such as background noise, the speaker's accent, and mispronunciation. When faced with noisy or unfamiliar auditory inputs, humans use context and prior knowledge to disambiguate the stimulus and take pragmatic actions, a process referred to as top-down processing in cognitive science. We present a cognitively inspired model, Speech Instruction Following through Theory of Mind (SIFToM), that enables robots to pragmatically follow human instructions under diverse speech conditions by inferring the human's goal and joint plan as a prior for speech perception and understanding. We test SIFToM in simulated home experiments (VirtualHome 2). Results show that SIFToM outperforms state-of-the-art speech and language models, approaching human-level accuracy on challenging speech-instruction-following tasks. We then demonstrate its ability at the task planning level on a mobile manipulator for breakfast preparation tasks.
   Submitted 16 September, 2024; originally announced September 2024.
   Comments: 7 pages, 4 figures
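
Illustrative sketch (not from the paper): the reranking idea -- combining noisy speech-recognition scores with a prior derived from inferred goals and plans -- can be expressed as a small Bayesian update. All names and numbers below are made-up assumptions:

    import numpy as np

    hypotheses = ["grab the mug", "grab the rug", "crab the mug"]
    asr_likelihood = np.array([0.30, 0.45, 0.25])  # acoustic-model scores

    # Prior from inverse planning: how consistent each command is with the
    # inferred goal (e.g., making coffee). A real system would derive this
    # from a task planner; here it is hand-set for illustration.
    plan_prior = np.array([0.80, 0.15, 0.05])

    posterior = asr_likelihood * plan_prior
    posterior /= posterior.sum()
    print(hypotheses[int(posterior.argmax())])     # -> "grab the mug"
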
6. arXiv:2408.12574 [pdf, other] · cs.AI, cs.CL, cs.CV, cs.LG
   MuMA-ToM: Multi-modal Multi-Agent Theory of Mind
   Authors: Haojun Shi, Suyu Ye, Xinyu Fang, Chuanyang Jin, Leyla Isik, Yen-Ling Kuo, Tianmin Shu
   Abstract: Understanding people's social interactions in complex real-world scenarios often relies on intricate mental reasoning. To truly understand how and why people interact with one another, we must infer the underlying mental states that give rise to the social interactions, i.e., Theory of Mind reasoning in multi-agent interactions. Additionally, social interactions are often multi-modal -- we can watch people's actions, hear their conversations, and/or read about their past behaviors. For AI systems to successfully and safely interact with people in real-world environments, they also need to understand people's mental states, as well as people's inferences about each other's mental states, based on multi-modal information about their interactions. For this, we introduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark. MuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates mental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide video and text descriptions of people's multi-modal behavior in realistic household environments. Based on the context, we then ask questions about people's goals, beliefs, and beliefs about others' goals. We validate MuMA-ToM in a human experiment and provide a human baseline. We also propose a novel multi-modal, multi-agent ToM model, LIMP (Language-model-based Inverse Multi-agent Planning). Our experimental results show that LIMP significantly outperforms state-of-the-art methods, including large multi-modal models (e.g., GPT-4o, Gemini 1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.
   Submitted 25 August, 2024; v1 submitted 22 August, 2024; originally announced August 2024.
   Comments: Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/; code: https://github.com/SCAI-JHU/MuMA-ToM
7. arXiv:2407.08968 [pdf, other] · cs.CV
   SlideGCD: Slide-based Graph Collaborative Training with Knowledge Distillation for Whole Slide Image Classification
   Authors: Tong Shu, Jun Shi, Dongdong Sun, Zhiguo Jiang, Yushan Zheng
   Abstract: Existing WSI analysis methods rest on the consensus that the histopathological characteristics of tumors provide significant guidance for cancer diagnostics. In particular, as the evolution of cancer is a continuous process, the correlations and differences across various stages, anatomical locations, and patients should be taken into account. However, recent research mainly focuses on the inner-contextual information in a single WSI, ignoring the correlations between slides. To verify whether introducing slide inter-correlations can improve WSI representation learning, we propose a generic WSI analysis pipeline, SlideGCD, that uses existing multi-instance learning (MIL) methods as the backbone and casts WSI classification as a node classification problem. More specifically, SlideGCD maintains a node buffer that stores previous slide embeddings for subsequent slide-based graph construction, and it conducts graph learning to explore the inter-correlations implied in the slide-based graph. Moreover, we frame the MIL classifier and graph learning as two parallel workflows and deploy knowledge distillation to transfer differentiable information to the graph neural network. Consistent performance improvements from SlideGCD are observed for four previous state-of-the-art MIL methods on two TCGA benchmark datasets. The code is available at https://github.com/HFUT-miaLab/SlideGCD.
   Submitted 19 July, 2024; v1 submitted 11 July, 2024; originally announced July 2024.
   Comments: Accepted at MICCAI 2024
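
Illustrative sketch (not from the paper): the node-buffer-plus-graph idea can be pictured as keeping a buffer of slide embeddings, linking each slide to its nearest neighbors, and smoothing embeddings over the resulting graph. The sizes, neighbor count, and mixing rule are illustrative assumptions:

    import numpy as np

    rng = np.random.default_rng(0)
    buffer = rng.normal(size=(16, 8))     # 16 stored slide embeddings, dim 8

    # Cosine-similarity kNN graph over the buffer.
    normed = buffer / np.linalg.norm(buffer, axis=1, keepdims=True)
    sim = normed @ normed.T
    np.fill_diagonal(sim, -np.inf)
    neighbors = np.argsort(sim, axis=1)[:, -3:]   # top-3 neighbors per slide

    # One message-passing step: mix each embedding with its neighbors' mean.
    smoothed = 0.5 * buffer + 0.5 * buffer[neighbors].mean(axis=1)
    print(smoothed.shape)                 # (16, 8)
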
8. arXiv:2406.18924 [pdf, other] · cs.AI, cs.LG, cs.RO
   Learning Pareto Set for Multi-Objective Continuous Robot Control
   Authors: Tianye Shu, Ke Shang, Cheng Gong, Yang Nan, Hisao Ishibuchi
   Abstract: For a control problem with multiple conflicting objectives, there exists a set of Pareto-optimal policies, called the Pareto set, rather than a single optimal policy. When a multi-objective control problem is continuous and complex, traditional multi-objective reinforcement learning (MORL) algorithms search for many Pareto-optimal deep policies to approximate the Pareto set, which is quite resource-consuming. In this paper, we propose a simple and resource-efficient MORL algorithm that learns a continuous representation of the Pareto set in a high-dimensional policy parameter space using a single hypernet. The learned hypernet can directly generate various well-trained policy networks for different user preferences. We compare our method with two state-of-the-art MORL algorithms on seven multi-objective continuous robot control problems. Experimental results show that our method achieves the best overall performance with the fewest training parameters. An interesting observation is that the Pareto set is well approximated by a curved line or surface in the high-dimensional parameter space; this observation may provide insight for researchers designing new MORL algorithms.
   Submitted 27 June, 2024; originally announced June 2024.
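
Illustrative sketch (not from the paper): the single-hypernet idea -- one network mapping a user's preference vector over objectives to the full weight vector of a policy network -- can be sketched in PyTorch as below. The architecture sizes and the linear policy are illustrative assumptions:

    import torch
    import torch.nn as nn

    obs_dim, act_dim, n_obj = 4, 2, 2
    n_params = obs_dim * act_dim + act_dim        # one linear policy layer

    hypernet = nn.Sequential(nn.Linear(n_obj, 32), nn.ReLU(),
                             nn.Linear(32, n_params))

    def policy(obs: torch.Tensor, pref: torch.Tensor) -> torch.Tensor:
        """Generate policy weights from the preference, then act."""
        theta = hypernet(pref)
        W = theta[: obs_dim * act_dim].view(act_dim, obs_dim)
        b = theta[obs_dim * act_dim :]
        return torch.tanh(obs @ W.T + b)

    obs = torch.randn(obs_dim)
    print(policy(obs, torch.tensor([0.7, 0.3])))  # one objective trade-off
    print(policy(obs, torch.tensor([0.1, 0.9])))  # another trade-off
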
9. arXiv:2405.14769 [pdf, other] · cs.LG, cs.CL
   Pragmatic Feature Preferences: Learning Reward-Relevant Preferences from Human Input
   Authors: Andi Peng, Yuying Sun, Tianmin Shu, David Abel
   Abstract: Humans use social context to specify preferences over behaviors, i.e., their reward functions. Yet algorithms for inferring reward models from preference data do not take this social-learning view into account. Inspired by pragmatic human communication, we study how to extract fine-grained data about why an example is preferred, which is useful for learning more accurate reward models. We propose enriching binary preference queries to ask (1) which features of a given example are preferable, in addition to (2) comparisons between the examples themselves. We derive an approach for learning from these feature-level preferences, both when users specify which features are reward-relevant and when they do not. We evaluate our approach in linear bandit settings in both vision- and language-based domains. Results show that our approach converges to accurate rewards more quickly than example-only labels, with fewer comparisons. Finally, we validate real-world applicability with a behavioral experiment on a mushroom foraging task. Our findings suggest that incorporating pragmatic feature preferences is a promising approach for more efficient, user-aligned reward learning.
   Submitted 23 May, 2024; originally announced May 2024.
   Comments: ICML 2024
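
Illustrative sketch (not from the paper): in a linear-reward setting, example-level comparisons can be fit with a Bradley-Terry (logistic) model, and feature-level input can then prune reward-irrelevant dimensions. The data, learning rule, and masking below are illustrative assumptions:

    import numpy as np

    rng = np.random.default_rng(1)
    d, n = 4, 200
    true_w = np.array([1.0, -2.0, 0.0, 0.0])      # only two features matter

    X_a = rng.normal(size=(n, d))
    X_b = rng.normal(size=(n, d))
    pref = (X_a @ true_w > X_b @ true_w).astype(float)  # 1 if A preferred

    w = np.zeros(d)
    for _ in range(2000):                         # logistic regression fit by
        p = 1.0 / (1.0 + np.exp(-(X_a - X_b) @ w))  # gradient ascent
        w += 0.1 * (X_a - X_b).T @ (pref - p) / n

    relevant = np.array([1.0, 1.0, 0.0, 0.0])     # user's feature-level input
    w *= relevant                                 # drop irrelevant dimensions
    print(np.round(w, 2))
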
10. arXiv:2404.10775 [pdf, other] · cs.CV, cs.AI, cs.MA
    COMBO: Compositional World Models for Embodied Multi-Agent Cooperation
    Authors: Hongxin Zhang, Zeyuan Wang, Qiushi Lyu, Zheyuan Zhang, Sunli Chen, Tianmin Shu, Yilun Du, Chuang Gan
    Abstract: In this paper, we investigate the problem of embodied multi-agent cooperation, where decentralized agents must cooperate given only partial egocentric views of the world. To plan effectively in this setting -- in contrast to learning world dynamics in a single-agent scenario -- we must simulate world dynamics conditioned on an arbitrary number of agents' actions, given only partial egocentric visual observations of the world. To address this issue of partial observability, we first train generative models to estimate the overall world state given partial egocentric observations. To enable accurate simulation of multiple sets of actions on this world state, we then propose to learn a compositional world model for multi-agent cooperation by factorizing the naturally composable joint actions of multiple agents and compositionally generating the video. By leveraging this compositional world model, in combination with Vision Language Models that infer the actions of other agents, we can use a tree-search procedure to integrate these modules and facilitate online cooperative planning. To evaluate the efficacy of our methods, we create two challenging embodied multi-agent long-horizon cooperation tasks using the ThreeDWorld simulator and conduct experiments with 2-4 agents. The results show that our compositional world model is effective and that the framework enables embodied agents to cooperate efficiently with different partners, across various tasks, and with an arbitrary number of agents, demonstrating the promise of our framework. More videos can be found at https://vis-www.cs.umass.edu/combo/.
    Submitted 16 April, 2024; originally announced April 2024.
    Comments: 23 pages. The first three authors contributed equally.
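
Illustrative sketch (not from the paper): one way to picture a compositional world model is a per-agent transition module whose predicted effects are summed, so the same model handles any number of agents. The toy module and additive composition are illustrative assumptions:

    import torch
    import torch.nn as nn

    state_dim, act_dim = 6, 3
    agent_module = nn.Sequential(nn.Linear(state_dim + act_dim, 32),
                                 nn.ReLU(), nn.Linear(32, state_dim))

    def step(state, actions):
        """Compose one shared module over an arbitrary set of agents' actions."""
        delta = sum(agent_module(torch.cat([state, a])) for a in actions)
        return state + delta

    s = torch.zeros(state_dim)
    acts = [torch.randn(act_dim) for _ in range(3)]  # works for 2, 3, 4... agents
    print(step(s, acts).shape)                       # torch.Size([6])
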
In this survey, we provide a comprehensive review of the existing literature in LLM-enhanced RL and summarize its characteristics compared to conventional RL methods, aiming to clarify the research scope and directions for future studies. Utilizing the classical agent-environment interaction paradigm, we propose a structured taxonomy to systematically categorize LLMs&#39; functionalities in RL, including four roles: information processor, reward designer, decision-maker, and generator. For each role, we summarize the methodologies, analyze the specific RL challenges that are mitigated, and provide insights into future directions. Lastly, we discuss a comparative analysis of each role, along with potential applications, prospective opportunities, and challenges of LLM-enhanced RL. By proposing this taxonomy, we aim to provide a framework for researchers to effectively leverage LLMs in the RL field, potentially accelerating RL applications in complex domains such as robotics, autonomous driving, and energy systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages (including bibliography), 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11075">arXiv:2403.11075</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.11075">pdf</a>, <a href="https://arxiv.org/format/2403.11075">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> GOMA: Proactive Embodied Cooperative Communication via Goal-Oriented Mental Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ying%2C+L">Lance Ying</a>, <a href="/search/cs?searchtype=author&amp;query=Jha%2C+K">Kunal Jha</a>, <a href="/search/cs?searchtype=author&amp;query=Aarya%2C+S">Shivam Aarya</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Torralba%2C+A">Antonio Torralba</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2403.11075v1-abstract-full" style="display: inline;"> Verbal communication plays a crucial role in human cooperation, particularly when the partners only have incomplete information about the task, environment, and each other&#39;s mental state.
In this paper, we propose a novel cooperative communication framework, Goal-Oriented Mental Alignment (GOMA). GOMA formulates verbal communication as a planning problem that minimizes the misalignment between the parts of agents&#39; mental states that are relevant to the goals. This approach enables an embodied assistant to reason about when and how to proactively initiate communication with humans verbally using natural language to help achieve better cooperation. We evaluate our approach against strong baselines in two challenging environments, Overcooked (a multiplayer game) and VirtualHome (a household simulator). Our experimental results demonstrate that large language models struggle with generating meaningful communication that is grounded in the social and physical context. In contrast, our approach can successfully generate concise verbal communication that enables the embodied assistant to effectively boost both the performance of the cooperation and human users&#39; perception of the assistant. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li>
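<p class="is-size-7">The abstract frames communication as planning that minimizes goal-relevant mental-state misalignment. The sketch below is a minimal, hypothetical rendering of that idea: utterances are scored by how much they reduce disagreement on goal-relevant facts, minus a speaking cost. The belief representations, utterance format, and costs are our illustrative assumptions, not the GOMA implementation.</p>
<pre><code class="language-python"># Hypothetical sketch of communication-as-planning: pick the utterance that
# most reduces goal-relevant misalignment between two agents' beliefs.

def misalignment(belief_a, belief_b, relevant_keys):
    # Count goal-relevant facts on which the two agents disagree.
    return sum(belief_a.get(k) != belief_b.get(k) for k in relevant_keys)

def best_utterance(speaker, listener, relevant_keys, cost_per_word=0.1):
    """Score each candidate utterance by misalignment reduction minus cost."""
    base = misalignment(speaker, listener, relevant_keys)
    best, best_gain = None, 0.0
    for key in relevant_keys:
        updated = dict(listener, **{key: speaker.get(key)})  # simulated update
        utterance = f"the {key} is in the {speaker.get(key)}"
        gain = (base - misalignment(speaker, updated, relevant_keys)
                - cost_per_word * len(utterance.split()))
        if gain > best_gain:
            best, best_gain = utterance, gain
    return best  # None means staying silent is optimal

speaker = {"mug": "cabinet", "fork": "drawer"}
listener = {"mug": "table", "fork": "drawer"}
print(best_utterance(speaker, listener, relevant_keys=["mug"]))
</code></pre>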
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.08743">arXiv:2401.08743</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.08743">pdf</a>, <a href="https://arxiv.org/format/2401.08743">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MMToM-QA: Multimodal Theory of Mind Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jin%2C+C">Chuanyang Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yutong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jing Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+J">Jiannan Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+Y">Yen-Ling Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhiting Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ullman%2C+T">Tomer Ullman</a>, <a href="/search/cs?searchtype=author&amp;query=Torralba%2C+A">Antonio Torralba</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2401.08743v2-abstract-full" style="display: inline;"> Theory of Mind (ToM), the ability to understand people&#39;s mental states, is an essential ingredient for developing machines with human-level social intelligence. Recent machine learning models, particularly large language models, seem to show some aspects of ToM understanding. However, existing ToM benchmarks use unimodal datasets - either video or text. Human ToM, on the other hand, is more than video or text understanding. People can flexibly reason about another person&#39;s mind based on conceptual representations (e.g., goals, beliefs, plans) extracted from any available data.
To address this, we introduce a multimodal Theory of Mind question answering (MMToM-QA) benchmark. MMToM-QA comprehensively evaluates machine ToM both on multimodal data and on different kinds of unimodal data about a person&#39;s activity in a household environment. To engineer multimodal ToM capacity, we propose a novel method, BIP-ALM (Bayesian Inverse Planning Accelerated by Language Models). BIP-ALM extracts unified representations from multimodal data and utilizes language models for scalable Bayesian inverse planning. We conducted a systematic comparison of human performance, BIP-ALM, and state-of-the-art models, including GPT-4. The experiments demonstrate that large language models and large multimodal models still lack robust ToM capacity. BIP-ALM, on the other hand, shows promising results by leveraging the power of both model-based mental inference and language models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024. 26 pages, 11 figures, 7 tables</span> </p> </li>
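<p class="is-size-7">BIP-ALM combines Bayesian inverse planning with language models. The sketch below shows only the generic inverse-planning core: a posterior over goals is updated by the likelihood of observed actions under each goal. The Boltzmann action model stands in for the language-model likelihood; all names, positions, and numbers are illustrative assumptions.</p>
<pre><code class="language-python"># Hypothetical sketch of Bayesian inverse planning in the spirit of BIP-ALM.
import math

GOALS = ["fridge", "cabinet"]
POSITIONS = {"fridge": 0, "cabinet": 10}

def action_likelihood(action, state, goal, beta=2.0):
    # Soft-rational agent: actions that end closer to the goal are
    # exponentially more likely (a standard Boltzmann-policy assumption).
    scores = {a: math.exp(-beta * abs(state + a - POSITIONS[goal]))
              for a in (-1, 1)}
    return scores[action] / sum(scores.values())

def posterior_over_goals(trajectory):
    """P(goal | actions) is proportional to P(goal) * prod_t P(a_t | s_t, goal)."""
    probs = dict.fromkeys(GOALS, 1.0 / len(GOALS))  # uniform prior
    for state, action in trajectory:
        for g in GOALS:
            probs[g] *= action_likelihood(action, state, g)
    z = sum(probs.values())
    return {g: p / z for g, p in probs.items()}

# An agent at positions 5 then 6, moving right, looks headed for the cabinet.
print(posterior_over_goals([(5, 1), (6, 1)]))
</code></pre>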
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.01493">arXiv:2401.01493</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.01493">pdf</a>, <a href="https://arxiv.org/format/2401.01493">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Free Lunch for Federated Remote Sensing Target Fine-Grained Classification: A Parameter-Efficient Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shengchao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Ting Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+S">Sufen Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Lina Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2401.01493v1-abstract-full" style="display: inline;"> Remote Sensing Target Fine-grained Classification (TFGC) is of great significance in both military and civilian fields. Due to location differences, growth in data size, and centralized server storage constraints, these data are usually stored under different databases across regions/countries. However, privacy laws and national security concerns constrain researchers from accessing these sensitive remote sensing images for further analysis. Additionally, low-resource remote sensing devices encounter challenges in terms of communication overhead and efficiency when dealing with the ever-increasing data and model scales. To solve the above challenges, this paper proposes a novel Privacy-Preserving TFGC Framework based on Federated Learning, dubbed PRFL. The proposed framework allows each client to learn global and local knowledge to enhance the local representation of private data in environments with extreme statistical heterogeneity (non-Independent and Identically Distributed, non-IID). Thus, it provides highly customized models to clients with differentiated data distributions. Moreover, the framework minimizes communication overhead and improves efficiency while ensuring satisfactory performance, thereby enhancing robustness and practical applicability under resource-scarce conditions. We demonstrate the effectiveness of the proposed PRFL on the classical TFGC task by leveraging four public datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review, 23 pages, 3 figures, 12 tables</span> </p> </li>
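<p class="is-size-7">A minimal sketch of the "global plus local knowledge" idea described above, assuming a FedAvg-style scheme in which clients share a global parameter while keeping a personalized local one. This is our simplified reading of personalized federated learning in general, not the PRFL algorithm itself.</p>
<pre><code class="language-python"># Hypothetical sketch of personalized federated averaging: clients share a
# global parameter vector and keep a private local component.
def local_update(global_w, local_w, data, lr=0.1):
    # Toy 1-D "training": nudge both components toward the client's data mean.
    target = sum(data) / len(data)
    grad = (global_w + local_w) - target
    return global_w - lr * grad, local_w - lr * grad

def federated_round(global_w, clients):
    """One communication round: clients train, server averages global parts."""
    new_globals = []
    for c in clients:
        g, c["local_w"] = local_update(global_w, c["local_w"], c["data"])
        new_globals.append(g)
    return sum(new_globals) / len(new_globals)  # FedAvg on the shared part

clients = [{"local_w": 0.0, "data": [1.0, 2.0]},   # statistically
           {"local_w": 0.0, "data": [9.0, 10.0]}]  # heterogeneous clients
w = 0.0
for _ in range(50):
    w = federated_round(w, clients)
# Shared weight settles in the middle; local weights absorb client-specific bias.
print(round(w, 2), [round(c["local_w"], 2) for c in clients])
</code></pre>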
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.05230">arXiv:2312.05230</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.05230">pdf</a>, <a href="https://arxiv.org/format/2312.05230">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Language Models, Agent Models, and World Models: The LAW for Machine Reasoning and Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhiting Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2312.05230v1-abstract-full" style="display: inline;"> Despite their tremendous success in many applications, large language models often fall short of consistent reasoning and planning in various (language, embodied, and social) scenarios, due to inherent limitations in their inference, learning, and modeling capabilities. In this position paper, we present a new perspective of machine reasoning, LAW, that connects the concepts of Language models, Agent models, and World models, for more robust and versatile reasoning capabilities. In particular, we propose that world and agent models are a better abstraction of reasoning that introduces the crucial elements of deliberate human-like reasoning, including beliefs about the world and other agents, anticipation of consequences, goals/rewards, and strategic planning. Crucially, language models in LAW serve as a backend to implement the system or its elements and hence provide the computational power and adaptability. We review the recent studies that have made relevant progress and discuss future research directions towards operationalizing the LAW framework.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05230v1-abstract-full').style.display = 'none'; document.getElementById('2312.05230v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Position paper. Accompanying NeurIPS2023 Tutorial: https://sites.google.com/view/neurips2023law/home</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.14242">arXiv:2308.14242</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.14242">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Cultural Psychology of Large Language Models: Is ChatGPT a Holistic or Analytic Thinker? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jin%2C+C">Chuanyang Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Songyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Z">Zhihan Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.14242v1-abstract-short" style="display: inline;"> The prevalent use of Large Language Models (LLMs) has necessitated studying their mental models, yielding noteworthy theoretical and practical implications. Current research has demonstrated that state-of-the-art LLMs, such as ChatGPT, exhibit certain theory of mind capabilities and possess relatively stable Big Five and/or MBTI personality traits. In addition, cognitive process features form an e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.14242v1-abstract-full').style.display = 'inline'; document.getElementById('2308.14242v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.14242v1-abstract-full" style="display: none;"> The prevalent use of Large Language Models (LLMs) has necessitated studying their mental models, yielding noteworthy theoretical and practical implications. Current research has demonstrated that state-of-the-art LLMs, such as ChatGPT, exhibit certain theory of mind capabilities and possess relatively stable Big Five and/or MBTI personality traits. In addition, cognitive process features form an essential component of these mental models. Research in cultural psychology indicated significant differences in the cognitive processes of Eastern and Western people when processing information and making judgments. 
While Westerners predominantly exhibit analytical thinking that isolates things from their environment to analyze their nature independently, Easterners often showcase holistic thinking, emphasizing relationships and adopting a global viewpoint. In our research, we probed the cultural cognitive traits of ChatGPT. We employed two scales that directly measure the cognitive process: the Analysis-Holism Scale (AHS) and the Triadic Categorization Task (TCT). Additionally, we used two scales that investigate the value differences shaped by cultural thinking: the Dialectical Self Scale (DSS) and the Self-construal Scale (SCS). In cognitive process tests (AHS/TCT), ChatGPT consistently tends towards Eastern holistic thinking, but regarding value judgments (DSS/SCS), ChatGPT does not significantly lean towards the East or the West. We suggest that the result could be attributed to both the training paradigm and the training data in LLM development. We discuss the potential value of this finding for AI research and directions for future research. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.11071">arXiv:2308.11071</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.11071">pdf</a>, <a href="https://arxiv.org/format/2308.11071">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Neural Amortized Inference for Nested Multi-agent Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jha%2C+K">Kunal Jha</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+T+A">Tuan Anh Le</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+C">Chuanyang Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+Y">Yen-Ling Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2308.11071v1-abstract-full" style="display: inline;"> Multi-agent interactions, such as communication, teaching, and bluffing, often rely on higher-order social inference, i.e., understanding how others infer oneself. Such intricate reasoning can be effectively modeled through nested multi-agent reasoning. Nonetheless, the computational complexity escalates exponentially with each level of reasoning, posing a significant challenge.
However, humans effortlessly perform complex social inferences as part of their daily lives. To bridge the gap between human-like inference capabilities and computational limitations, we propose a novel approach: leveraging neural networks to amortize high-order social inference, thereby expediting nested multi-agent reasoning. We evaluate our method in two challenging multi-agent interaction domains. The experimental results demonstrate that our method is computationally efficient while exhibiting minimal degradation in accuracy. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 10 figures</span> </p> </li>
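<p class="is-size-7">A toy rendering of amortization as described above: an expensive posterior (here a stand-in function) is imitated by a cheap learned approximation that can then be reused at every level of nested reasoning. The logistic forms, learning rate, and training loop are our illustrative assumptions, not the paper&#39;s architecture.</p>
<pre><code class="language-python"># Hypothetical sketch of amortized inference: fit a cheap parametric
# "recognition" function to imitate an expensive exact posterior.
import math, random

def exact_posterior(evidence):
    # Stand-in for costly nested inference: a logistic function of evidence.
    return 1.0 / (1.0 + math.exp(-3.0 * (evidence - 0.5)))

def train_amortized(steps=20000, lr=0.5):
    """Fit w, b in sigmoid(w*x + b) to the exact posterior by SGD."""
    random.seed(0)
    w, b = 0.0, 0.0
    for _ in range(steps):
        x = random.random()
        pred = 1.0 / (1.0 + math.exp(-(w * x + b)))
        err = pred - exact_posterior(x)          # squared-error residual
        grad = err * pred * (1.0 - pred)         # chain rule through sigmoid
        w, b = w - lr * grad * x, b - lr * grad
    return w, b

w, b = train_amortized()
x = 0.8
fast = 1.0 / (1.0 + math.exp(-(w * x + b)))      # cheap amortized estimate
print(f"amortized={fast:.3f} exact={exact_posterior(x):.3f}")
</code></pre>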
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.02242">arXiv:2308.02242</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.02242">pdf</a>, <a href="https://arxiv.org/ps/2308.02242">ps</a>, <a href="https://arxiv.org/format/2308.02242">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Countering Eavesdroppers with Meta-learning-based Cooperative Ambient Backscatter Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chu%2C+N+H">Nam H. Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Huynh%2C+N">Nguyen Van Huynh</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+D+N">Diep N. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Hoang%2C+D+T">Dinh Thai Hoang</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+S">Shimin Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tao Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Dutkiewicz%2C+E">Eryk Dutkiewicz</a>, <a href="/search/cs?searchtype=author&amp;query=Phan%2C+K+T">Khoa T. Phan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2308.02242v1-abstract-full" style="display: inline;"> This article introduces a novel lightweight framework using ambient backscattering communications to counter eavesdroppers. In particular, our framework divides an original message into two parts: (i) the active-transmit message transmitted by the transmitter using conventional RF signals and (ii) the backscatter message transmitted by an ambient backscatter tag that backscatters upon the active signals emitted by the transmitter. Notably, the backscatter tag does not generate its own signal, making it difficult for an eavesdropper to detect the backscattered signals unless they have prior knowledge of the system. Here, we assume that without decoding/knowing the backscatter message, the eavesdropper is unable to decode the original message. Even in scenarios where the eavesdropper can capture both messages, reconstructing the original message is a complex task without understanding the intricacies of the message-splitting mechanism. A challenge in our proposed framework is to effectively decode the backscattered signals at the receiver, often accomplished using the maximum likelihood (MLK) approach. However, such a method may require a complex mathematical model together with perfect channel state information (CSI). To address this issue, we develop a novel deep meta-learning-based signal detector that can not only effectively decode the weak backscattered signals without requiring perfect CSI but also quickly adapt to a new wireless environment with very little knowledge. Simulation results show that our proposed learning approach, without requiring perfect CSI and a complex mathematical model, can achieve a bit error ratio close to that of the MLK-based approach. They also clearly show the efficiency of the proposed approach in dealing with eavesdropping attacks and with the lack of training data for deep learning models in practical scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li>
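<p class="is-size-7">The abstract describes splitting a message into an active-transmit part and a backscatter part, without specifying the splitting mechanism. The sketch below uses XOR secret-sharing as a hypothetical stand-in with the stated property that neither part alone reveals the original message; it is not the paper&#39;s scheme.</p>
<pre><code class="language-python"># Hypothetical sketch of the two-part message split described above,
# using XOR secret-sharing as an illustrative stand-in.
import secrets

def split_message(message: bytes):
    """Split into an active-transmit part and a backscatter part."""
    backscatter = secrets.token_bytes(len(message))          # random share
    active = bytes(m ^ b for m, b in zip(message, backscatter))
    return active, backscatter

def reconstruct(active: bytes, backscatter: bytes) -> bytes:
    # The receiver must decode BOTH channels to recover the original.
    return bytes(a ^ b for a, b in zip(active, backscatter))

msg = b"meet at dawn"
active, tag = split_message(msg)
assert reconstruct(active, tag) == msg
print(active)  # looks uniformly random without the backscatter part
</code></pre>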
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.06333">arXiv:2307.06333</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.06333">pdf</a>, <a href="https://arxiv.org/format/2307.06333">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Diagnosis, Feedback, Adaptation: A Human-in-the-Loop Framework for Test-Time Policy Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+A">Andi Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Netanyahu%2C+A">Aviv Netanyahu</a>, <a href="/search/cs?searchtype=author&amp;query=Ho%2C+M">Mark Ho</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Bobu%2C+A">Andreea Bobu</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+J">Julie Shah</a>, <a href="/search/cs?searchtype=author&amp;query=Agrawal%2C+P">Pulkit Agrawal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2307.06333v2-abstract-full" style="display: inline;"> Policies often fail due to distribution shift -- changes in the state and reward that occur when a policy is deployed in new environments. Data augmentation can increase robustness by making the model invariant to task-irrelevant changes in the agent&#39;s observation. However, designers don&#39;t know which concepts are irrelevant a priori, especially when different end users have different preferences about how the task is performed. We propose an interactive framework to leverage feedback directly from the user to identify personalized task-irrelevant concepts. Our key idea is to generate counterfactual demonstrations that allow users to quickly identify possible task-relevant and irrelevant concepts. The knowledge of task-irrelevant concepts is then used to perform data augmentation and thus obtain a policy adapted to personalized user objectives. We present experiments validating our framework on discrete and continuous control tasks with real human users.
Our method (1) enables users to better understand agent failure, (2) reduces the number of demonstrations required for fine-tuning, and (3) aligns the agent to individual user task preferences. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">International Conference on Machine Learning (ICML) 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.02485">arXiv:2307.02485</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.02485">pdf</a>, <a href="https://arxiv.org/format/2307.02485">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Building Cooperative Embodied Agents Modularly with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+W">Weihua Du</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+J">Jiaming Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Q">Qinhong Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Y">Yilun Du</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+C">Chuang Gan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2307.02485v2-abstract-full" style="display: inline;"> In this work, we address challenging multi-agent cooperation problems with decentralized control, raw sensory observations, costly communication, and multi-objective tasks instantiated in various embodied environments.
While previous research either presupposes a cost-free communication channel or relies on a centralized controller with shared observations, we harness the commonsense knowledge, reasoning ability, language comprehension, and text generation prowess of LLMs and seamlessly incorporate them into a cognitive-inspired modular framework that integrates with perception, memory, and execution, thus building a Cooperative Embodied Language Agent, CoELA, which can plan, communicate, and cooperate with others to accomplish long-horizon tasks efficiently. Our experiments on C-WAH and TDW-MAT demonstrate that CoELA driven by GPT-4 can surpass strong planning-based methods and exhibit emergent effective communication. Though current open LMs like LLAMA-2 still underperform, we fine-tune CoELA with data collected with our agents and show how it can achieve promising performance. We also conducted a user study for human-agent interaction and discovered that CoELA communicating in natural language can earn more trust and cooperate more effectively with humans. Our research underscores the potential of LLMs for future research in multi-agent cooperation. Videos can be found on the project website https://vis-www.cs.umass.edu/Co-LLM-Agents/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR24. The first two authors contributed equally</span> </p> </li>
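<p class="is-size-7">A schematic sketch of the modular design described above (perception, memory, LLM-backed planning and communication, execution). The <code>call_llm</code> stub and the message format are invented for illustration; in the paper this role is played by an actual LLM such as GPT-4.</p>
<pre><code class="language-python"># Hypothetical sketch of a modular embodied agent in the spirit of CoELA.
def call_llm(prompt: str) -> str:
    # Stand-in for an LLM query; returns a canned plan for illustration.
    return "send: 'I will fetch the plate'; act: goto(kitchen)"

class ModularAgent:
    def __init__(self):
        self.memory = []                        # episodic memory module

    def perceive(self, observation: str) -> None:
        self.memory.append(observation)         # perception feeds memory

    def plan(self, goal: str) -> str:
        context = " | ".join(self.memory[-5:])  # recent memory as context
        return call_llm(f"goal={goal}; context={context}")

    def step(self, observation: str, goal: str) -> tuple[str, str]:
        self.perceive(observation)
        decision = self.plan(goal)
        message, action = (part.split(": ", 1)[1]
                           for part in decision.split("; "))
        return message, action                  # communicate, then execute

agent = ModularAgent()
print(agent.step("I see a plate on the table", goal="set the table"))
</code></pre>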
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10626">arXiv:2305.10626</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.10626">pdf</a>, <a href="https://arxiv.org/format/2305.10626">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Language Models Meet World Models: Embodied Experiences Enhance Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+J">Jiannan Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+T">Tianhua Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zirui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zichao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhiting Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2305.10626v3-abstract-full" style="display: inline;"> While large language models (LMs) have shown remarkable capabilities across numerous tasks, they often struggle with simple reasoning and planning in physical environments, such as understanding object permanence or planning household activities. The limitation arises from the fact that LMs are trained only on written text and miss essential embodied knowledge and skills. In this paper, we propose a new paradigm of enhancing LMs by finetuning them with world models, to gain diverse embodied knowledge while retaining their general language capabilities. Our approach deploys an embodied agent in a world model, particularly a simulator of the physical world (VirtualHome), and acquires a diverse set of embodied experiences through both goal-oriented planning and random exploration. These experiences are then used to finetune LMs to teach diverse abilities of reasoning and acting in the physical world, e.g., planning and completing goals, object permanence and tracking, etc.
Moreover, it is desirable to preserve the generality of LMs during finetuning, which facilitates generalizing the embodied knowledge across tasks rather than being tied to specific simulations. We thus further introduce classical elastic weight consolidation (EWC) for selective weight updates, combined with low-rank adapters (LoRA) for training efficiency. Extensive experiments show our approach substantially improves base LMs on 18 downstream tasks by 64.28% on average. In particular, the small LMs (1.3B, 6B, and 13B) enhanced by our approach match or even outperform much larger LMs (e.g., ChatGPT). </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li>
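<p class="is-size-7">For concreteness, the sketch below shows the EWC idea mentioned above: a quadratic penalty, weighted by per-parameter Fisher values, keeps weights that were important for the original task close to their pre-finetuning values. The toy numbers and finite-difference optimizer are ours; LoRA is a separate efficiency technique not shown here.</p>
<pre><code class="language-python"># Hypothetical sketch of an EWC-regularized update on toy parameters.
def ewc_penalty(theta, theta_star, fisher, lam=10.0):
    """(lam/2) * sum_i F_i * (theta_i - theta*_i)^2 -- protects weights
    that were important (high Fisher) for the original language task."""
    return 0.5 * lam * sum(f * (t - s) ** 2
                           for f, t, s in zip(fisher, theta, theta_star))

def total_loss(theta, theta_star, fisher, task_loss):
    return task_loss(theta) + ewc_penalty(theta, theta_star, fisher)

# Toy setup: two weights; the first mattered a lot for the old task.
theta_star = [1.0, -0.5]
fisher = [5.0, 0.01]
new_task = lambda th: (th[0] - 2.0) ** 2 + (th[1] - 2.0) ** 2

# Finetune by gradient descent on the combined objective (finite differences).
theta, eps, lr = list(theta_star), 1e-5, 0.01
for _ in range(2000):
    for i in range(len(theta)):
        up = list(theta); up[i] += eps
        g = (total_loss(up, theta_star, fisher, new_task)
             - total_loss(theta, theta_star, fisher, new_task)) / eps
        theta[i] -= lr * g
print([round(t, 2) for t in theta])  # weight 0 barely moves; weight 1 adapts
</code></pre>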
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14857">arXiv:2304.14857</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.14857">pdf</a>, <a href="https://arxiv.org/format/2304.14857">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.knosys.2023.110881">10.1016/j.knosys.2023.110881 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MASK-CNN-Transformer For Real-Time Multi-Label Weather Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shengchao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Ting Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y+Y">Yuan Yan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2304.14857v2-abstract-full" style="display: inline;"> Weather recognition is an essential support for many practical life applications, including traffic safety, environment, and meteorology. However, many existing related works cannot comprehensively describe weather conditions due to their complex co-occurrence dependencies. This paper proposes a novel multi-label weather recognition model considering these dependencies. The proposed model, called MASK-Convolutional Neural Network-Transformer (MASK-CT), is based on the Transformer, the convolutional process, and the MASK mechanism. The model employs multiple convolutional layers to extract features from weather images and a Transformer encoder to calculate the probability of each weather condition based on the extracted features. To improve the generalization ability of MASK-CT, a MASK mechanism is used during the training phase. The effect of the MASK mechanism is explored and discussed. The MASK mechanism randomly withholds some information from paired training instances (an image and its corresponding label). There are two types of MASK methods. Specifically, MASK-I is designed and deployed on the image before feeding it into the weather feature extractor, and MASK-II is applied to the image label. The Transformer encoder is then utilized on the randomly masked image features and labels. The experimental results from various real-world weather recognition datasets demonstrate that the proposed MASK-CT model outperforms state-of-the-art methods. Furthermore, the high-speed dynamic real-time weather recognition capability of MASK-CT is evaluated. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Knowledge-Based Systems; see https://www.sciencedirect.com/science/article/pii/S0950705123006317</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Knowledge-Based Systems, 110881 (2023) </p> </li>
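<p class="is-size-7">A minimal sketch of the two masking variants described above: MASK-I randomly withholds input-feature entries, while MASK-II randomly hides label entries. Shapes, masking rates, and the ignore-value convention are illustrative assumptions, not the paper&#39;s exact configuration.</p>
<pre><code class="language-python"># Hypothetical sketch of the MASK-I / MASK-II training-time masking idea.
import random

def mask_features(features, rate=0.15):
    """MASK-I: zero out a random subset of image-feature entries."""
    return [0.0 if rate > random.random() else f for f in features]

def mask_labels(labels, rate=0.15, ignore=-1):
    """MASK-II: hide some multi-label targets (to be ignored by the loss)."""
    return [ignore if rate > random.random() else y for y in labels]

random.seed(0)
features = [0.8, 0.1, 0.5, 0.9]        # pretend CNN features of one image
labels = [1, 0, 1, 0, 0]               # multi-label weather annotation
print(mask_features(features), mask_labels(labels))
</code></pre>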
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14131">arXiv:2304.14131</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.14131">pdf</a>, <a href="https://arxiv.org/format/2304.14131">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TGRS.2023.3311510">10.1109/TGRS.2023.3311510 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> TempEE: Temporal-Spatial Parallel Transformer for Radar Echo Extrapolation Beyond Auto-Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shengchao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Ting Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+G">Guo Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xunlai Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2304.14131v2-abstract-full" style="display: inline;"> Meteorological radar reflectivity data (i.e., radar echo) significantly influences precipitation prediction. It can facilitate accurate and expeditious forecasting of short-term heavy rainfall, bypassing the need for complex Numerical Weather Prediction (NWP) models. In comparison to conventional models, Deep Learning (DL)-based radar echo extrapolation algorithms exhibit higher effectiveness and efficiency. Nevertheless, the development of a reliable and generalized echo extrapolation algorithm is impeded by three primary challenges: cumulative error spreading, imprecise representation of sparsely distributed echoes, and inaccurate description of non-stationary motion processes. To tackle these challenges, this paper proposes a novel radar echo extrapolation algorithm called Temporal-Spatial Parallel Transformer, referred to as TempEE. TempEE avoids using auto-regression and instead employs a one-step forward strategy to prevent cumulative error spreading during the extrapolation process. Additionally, we propose the incorporation of a Multi-level Temporal-Spatial Attention mechanism to improve the algorithm&#39;s capability of capturing both global and local information while emphasizing task-related regions, including sparse echo representations, in an efficient manner. Furthermore, the algorithm extracts spatio-temporal representations from continuous echo images using a parallel encoder to model the non-stationary motion process for echo extrapolation. The superiority of our TempEE has been demonstrated in the context of the classic radar echo extrapolation task, utilizing a real-world dataset. Extensive experiments have further validated the efficacy and indispensability of various components within TempEE. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Geoscience and Remote Sensing; see https://ieeexplore.ieee.org/document/10238744</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Geoscience and Remote Sensing 61, 5108914 (2023) </p> </li>
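<p class="is-size-7">To make the "one-step forward" point concrete: the sketch below contrasts autoregressive extrapolation, where each predicted frame is fed back as input so errors can compound, with a single-pass scheme that predicts every horizon directly from the observed frames. The linear toy predictor is our illustrative assumption, not TempEE.</p>
<pre><code class="language-python"># Hypothetical sketch: autoregressive rollout vs. one-step-forward prediction.
def predict_next(frames):
    # Toy single-frame predictor: linear extrapolation of the last two frames.
    return 2 * frames[-1] - frames[-2]

def autoregressive(frames, horizon):
    out = list(frames)
    for _ in range(horizon):
        out.append(predict_next(out))   # each prediction feeds the next one,
    return out[len(frames):]            # so any error compounds over time

def one_step_forward(frames, horizon):
    # All horizons predicted from the *observed* frames only, in parallel,
    # so no predicted frame is ever fed back in as input.
    slope = frames[-1] - frames[-2]
    return [frames[-1] + slope * (h + 1) for h in range(horizon)]

observed = [1.0, 2.0, 3.0]
print(autoregressive(observed, 3), one_step_forward(observed, 3))
</code></pre>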
of human-computer interaction technology makes it possible to compute emotions. In this paper, we introduce our submission to the CVPR 2023 Competition on Affective Behavior Analysis in-the-wild (ABAW). Sentiment analysis in human-computer interaction should, as far as possible Start with multiple dimensions, fill in the single imperfect emotion channel, and finally dete&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10421v1-abstract-full').style.display = 'inline'; document.getElementById('2303.10421v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.10421v1-abstract-full" style="display: none;"> The continuous improvement of human-computer interaction technology makes it possible to compute emotions. In this paper, we introduce our submission to the CVPR 2023 Competition on Affective Behavior Analysis in-the-wild (ABAW). Sentiment analysis in human-computer interaction should, as far as possible Start with multiple dimensions, fill in the single imperfect emotion channel, and finally determine the emotion tendency by fitting multiple results. Therefore, We exploited multimodal features extracted from video of different lengths from the competition dataset, including audio, pose and images. Well-informed emotion representations drive us to propose a Attention-based multimodal framework for emotion estimation. Our system achieves the performance of 0.361 on the validation dataset. The code is available at [https://github.com/xkwangcn/ABAW-5th-RT-IAI]. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10421v1-abstract-full').style.display = 'none'; document.getElementById('2303.10421v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 1 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.13445">arXiv:2302.13445</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.13445">pdf</a>, <a href="https://arxiv.org/ps/2302.13445">ps</a>, <a href="https://arxiv.org/format/2302.13445">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Resource Allocation for Metaverse Applications with Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chu%2C+N+H">Nam H. Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+D+N">Diep N. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Hoang%2C+D+T">Dinh Thai Hoang</a>, <a href="/search/cs?searchtype=author&amp;query=Phan%2C+K+T">Khoa T. 
Phan</a>, <a href="/search/cs?searchtype=author&amp;query=Dutkiewicz%2C+E">Eryk Dutkiewicz</a>, <a href="/search/cs?searchtype=author&amp;query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.13445v1-abstract-short" style="display: inline;"> This work proposes a novel framework to dynamically and effectively manage and allocate different types of resources for Metaverse applications, which are forecasted to demand massive resources of various types that have never been seen before. Specifically, by studying functions of Metaverse applications, we first propose an effective solution to divide applications into groups, namely MetaInstan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.13445v1-abstract-full').style.display = 'inline'; document.getElementById('2302.13445v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.13445v1-abstract-full" style="display: none;"> This work proposes a novel framework to dynamically and effectively manage and allocate different types of resources for Metaverse applications, which are forecasted to demand massive resources of various types that have never been seen before. Specifically, by studying functions of Metaverse applications, we first propose an effective solution to divide applications into groups, namely MetaInstances, where common functions can be shared among applications to enhance resource usage efficiency. Then, to capture the real-time, dynamic, and uncertain characteristics of request arrival and application departure processes, we develop a semi-Markov decision process-based framework and propose an intelligent algorithm that can gradually learn the optimal admission policy to maximize the revenue and resource usage efficiency for the Metaverse service provider and at the same time enhance the Quality-of-Service for Metaverse users. Extensive simulation results show that our proposed approach can achieve up to 120% greater revenue for the Metaverse service providers and up to 178.9% higher acceptance probability for Metaverse application requests than those of other baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.13445v1-abstract-full').style.display = 'none'; document.getElementById('2302.13445v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. 
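</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark">The admission-control idea in this abstract can be made concrete with a toy Python sketch: a tabular Q-learning agent decides whether to admit each arriving request so as to maximize long-term revenue under a capacity budget. The capacity, request types, departure model, and hyperparameters below are illustrative assumptions, not the paper&#39;s semi-Markov formulation or its learning algorithm.</span> </p> <pre><code class="language-python">
import random
from collections import defaultdict

CAPACITY = 10                              # abstract resource budget (assumed)
TYPES = [(3, 5.0), (1, 1.5), (5, 9.0)]     # (resource demand, revenue) per type
DEPART_P = 0.1                             # per-step departure probability (assumed)

def run_episode(Q, train=True, alpha=0.1, gamma=0.95, eps=0.1, horizon=50):
    """One episode of admit/reject decisions; returns the episode revenue."""
    active, total = [], 0.0
    t = random.randrange(len(TYPES))       # type of the first arriving request
    for _ in range(horizon):
        demand, revenue = TYPES[t]
        free = CAPACITY - sum(active)
        s = (free, t)
        a = (random.randint(0, 1) if train and random.random() &lt; eps
             else max((0, 1), key=lambda x: Q[(s, x)]))
        r = 0.0
        if a == 1 and demand &lt;= free:      # admit only if capacity allows
            active.append(demand)
            r = revenue
        active = [d for d in active if random.random() &gt; DEPART_P]  # departures
        t = random.randrange(len(TYPES))   # next arrival
        s2 = (CAPACITY - sum(active), t)
        if train:                          # one-step Q-learning update
            Q[(s, a)] += alpha * (r + gamma * max(Q[(s2, x)] for x in (0, 1)) - Q[(s, a)])
        total += r
    return total

Q = defaultdict(float)
for _ in range(2000):
    run_episode(Q, train=True)
print("avg revenue:", sum(run_episode(Q, train=False) for _ in range(200)) / 200)
</code></pre> <p class="is-size-7">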
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in the Proceedings of the IEEE WCNC 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.05223">arXiv:2301.05223</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.05223">pdf</a>, <a href="https://arxiv.org/format/2301.05223">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> NOPA: Neurally-guided Online Probabilistic Assistance for Building Socially Intelligent Home Assistants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Puig%2C+X">Xavier Puig</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Torralba%2C+A">Antonio Torralba</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.05223v1-abstract-short" style="display: inline;"> In this work, we study how to build socially intelligent robots to assist people in their homes. In particular, we focus on assistance with online goal inference, where robots must simultaneously infer humans&#39; goals and how to help them achieve those goals. Prior assistance methods either lack the adaptivity to adjust helping strategies (i.e., when and how to help) in response to uncertainty about&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05223v1-abstract-full').style.display = 'inline'; document.getElementById('2301.05223v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.05223v1-abstract-full" style="display: none;"> In this work, we study how to build socially intelligent robots to assist people in their homes. In particular, we focus on assistance with online goal inference, where robots must simultaneously infer humans&#39; goals and how to help them achieve those goals. Prior assistance methods either lack the adaptivity to adjust helping strategies (i.e., when and how to help) in response to uncertainty about goals or the scalability to conduct fast inference in a large goal space. Our NOPA (Neurally-guided Online Probabilistic Assistance) method addresses both of these challenges. NOPA consists of (1) an online goal inference module combining neural goal proposals with inverse planning and particle filtering for robust inference under uncertainty, and (2) a helping planner that discovers valuable subgoals to help with and is aware of the uncertainty in goal inference. 
We compare NOPA against multiple baselines in a new embodied AI assistance challenge: Online Watch-And-Help, in which a helper agent needs to simultaneously watch a main agent&#39;s actions, infer its goal, and help perform a common household task faster in realistic virtual home environments. Experiments show that our helper agent robustly updates its goal inference and adapts its helping plans to the changing level of uncertainty. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05223v1-abstract-full').style.display = 'none'; document.getElementById('2301.05223v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://www.tshu.io/online_watch_and_help. Code: https://github.com/xavierpuigf/online_watch_and_help</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.02951">arXiv:2212.02951</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.02951">pdf</a>, <a href="https://arxiv.org/format/2212.02951">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> State Space Closure: Revisiting Endless Online Level Generation via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianye Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jialin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.02951v2-abstract-short" style="display: inline;"> In this paper, we revisit endless online level generation with the recently proposed experience-driven procedural content generation via reinforcement learning (EDRL) framework. Inspired by an observation that EDRL tends to generate recurrent patterns, we formulate a notion of state space closure, which ensures that any stochastic state that may appear in an infinite-horizon online generation process ca&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.02951v2-abstract-full').style.display = 'inline'; document.getElementById('2212.02951v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.02951v2-abstract-full" style="display: none;"> In this paper, we revisit endless online level generation with the recently proposed experience-driven procedural content generation via reinforcement learning (EDRL) framework. Inspired by an observation that EDRL tends to generate recurrent patterns, we formulate a notion of state space closure, which ensures that any stochastic state that may appear in an infinite-horizon online generation process can also be found within a finite horizon.
Through theoretical analysis, we find that even though state space closure raises a concern about diversity, it generalises EDRL trained with a finite horizon to the infinite-horizon scenario without deterioration of content quality. Moreover, we verify the quality and the diversity of contents generated by EDRL via empirical studies on the widely used Super Mario Bros. benchmark. Experimental results reveal that the diversity of levels generated by EDRL is limited due to the state space closure, whereas their quality does not deteriorate over horizons longer than the one specified in training. Based on these outcomes and analyses, future work on endless online level generation via reinforcement learning should address the issue of diversity while assuring state space closure and content quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.02951v2-abstract-full').style.display = 'none'; document.getElementById('2212.02951v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the IEEE Transactions on Games</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.15339">arXiv:2211.15339</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.15339">pdf</a>, <a href="https://arxiv.org/format/2211.15339">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Discovering Generalizable Spatial Goal Representations via Graph-based Active Reward Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Netanyahu%2C+A">Aviv Netanyahu</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J">Joshua Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Agrawal%2C+P">Pulkit Agrawal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.15339v1-abstract-short" style="display: inline;"> In this work, we consider one-shot imitation learning for object rearrangement tasks, where an AI agent needs to watch a single expert demonstration and learn to perform the same task in different environments. To achieve a strong generalization, the AI agent must infer the spatial goal specification for the task.
However, there can be multiple goal specifications that fit the given demonstration.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.15339v1-abstract-full').style.display = 'inline'; document.getElementById('2211.15339v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.15339v1-abstract-full" style="display: none;"> In this work, we consider one-shot imitation learning for object rearrangement tasks, where an AI agent needs to watch a single expert demonstration and learn to perform the same task in different environments. To achieve a strong generalization, the AI agent must infer the spatial goal specification for the task. However, there can be multiple goal specifications that fit the given demonstration. To address this, we propose a reward learning approach, Graph-based Equivalence Mappings (GEM), that can discover spatial goal representations that are aligned with the intended goal specification, enabling successful generalization in unseen environments. Specifically, GEM represents a spatial goal specification by a reward function conditioned on i) a graph indicating important spatial relationships between objects and ii) state equivalence mappings for each edge in the graph indicating invariant properties of the corresponding relationship. GEM combines inverse reinforcement learning and active reward learning to efficiently improve the reward function by utilizing the graph structure and domain randomization enabled by the equivalence mappings. We conducted experiments with simulated oracles and with human subjects. The results show that GEM can drastically improve the generalizability of the learned goal representations over strong baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.15339v1-abstract-full').style.display = 'none'; document.getElementById('2211.15339v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. 
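</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark">The graph-conditioned reward described above can be illustrated with a small Python sketch: a goal is a set of pairwise spatial relations, and the reward is the fraction of relations a state satisfies. The objects, predicates, and tolerance are hypothetical, and GEM itself additionally learns the graph and the equivalence mappings from a demonstration, which this sketch does not attempt.</span> </p> <pre><code class="language-python">
import math

def near(p, q, tol=1.0):
    """Distance-based relation; invariant to jointly rotating both objects."""
    return math.dist(p, q) &lt;= tol

def left_of(p, q):
    """Orientation-sensitive relation (no such invariance)."""
    return p[0] &lt; q[0]

# Goal graph: edges are (object_i, object_j, relation predicate).
GOAL_GRAPH = [
    ("cup", "plate", near),
    ("fork", "plate", left_of),
]

def goal_reward(positions):
    """Fraction of goal-graph relations satisfied by a state."""
    ok = sum(pred(positions[i], positions[j]) for i, j, pred in GOAL_GRAPH)
    return ok / len(GOAL_GRAPH)

state = {"cup": (0.2, 0.5), "plate": (0.6, 0.4), "fork": (0.1, 0.4)}
print(goal_reward(state))   # 1.0: both relations hold in this arrangement
</code></pre> <p class="is-size-7">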
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2022, the first two authors contributed equally, project page https://www.tshu.io/GEM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.03022">arXiv:2210.03022</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.03022">pdf</a>, <a href="https://arxiv.org/format/2210.03022">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Stateful active facilitator: Coordination and Environmental Heterogeneity in Cooperative Multi-Agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dianbo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+V">Vedant Shah</a>, <a href="/search/cs?searchtype=author&amp;query=Boussif%2C+O">Oussama Boussif</a>, <a href="/search/cs?searchtype=author&amp;query=Meo%2C+C">Cristian Meo</a>, <a href="/search/cs?searchtype=author&amp;query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Mozer%2C+M">Michael Mozer</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.03022v3-abstract-short" style="display: inline;"> In cooperative multi-agent reinforcement learning, a team of agents works together to achieve a common goal. Different environments or tasks may require varying degrees of coordination among agents in order to achieve the goal in an optimal way. The nature of coordination will depend on the properties of the environment -- its spatial layout, distribution of obstacles, dynamics, etc. We term this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03022v3-abstract-full').style.display = 'inline'; document.getElementById('2210.03022v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.03022v3-abstract-full" style="display: none;"> In cooperative multi-agent reinforcement learning, a team of agents works together to achieve a common goal. Different environments or tasks may require varying degrees of coordination among agents in order to achieve the goal in an optimal way. The nature of coordination will depend on the properties of the environment -- its spatial layout, distribution of obstacles, dynamics, etc. We term this variation of properties within an environment as heterogeneity. Existing literature has not sufficiently addressed the fact that different environments may have different levels of heterogeneity. 
We formalize the notions of coordination level and heterogeneity level of an environment and present HECOGrid, a suite of multi-agent RL environments that facilitates empirical evaluation of different MARL approaches across different levels of coordination and environmental heterogeneity by providing a quantitative control over coordination and heterogeneity levels of the environment. Further, we propose a Centralized Training Decentralized Execution learning approach called Stateful Active Facilitator (SAF) that enables agents to work efficiently in high-coordination and high-heterogeneity environments through a differentiable and shared knowledge source used during training and dynamic selection from a shared pool of policies. We evaluate SAF and compare its performance against baselines IPPO and MAPPO on HECOGrid. Our results show that SAF consistently outperforms the baselines across different tasks and different heterogeneity and coordination levels. We release the code for HECOGrid as well as all our experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03022v3-abstract-full').style.display = 'none'; document.getElementById('2210.03022v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ICLR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.10255">arXiv:2209.10255</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.10255">pdf</a>, <a href="https://arxiv.org/format/2209.10255">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> PTSG: a test generation tool based on extended finite state machine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Z">Zhijie Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Ting Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Z">Zuohua Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.10255v1-abstract-short" style="display: inline;"> The Extended Finite State Machine (EFSM) is one of the most popular modeling approaches for model-based testing. However, EFSM-based test case generation is susceptible to the infeasible (inexecutable) path problem, which stems from the conflict of predicates (guards) between transitions in the path. 
Therefore, in order to derive feasible test cases, a test generation algorithm needs to dynamicall&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.10255v1-abstract-full').style.display = 'inline'; document.getElementById('2209.10255v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.10255v1-abstract-full" style="display: none;"> The Extended Finite State Machine (EFSM) is one of the most popular modeling approaches for model-based testing. However, EFSM-based test case generation is susceptible to the infeasible (inexecutable) path problem, which stems from the conflict of predicates (guards) between transitions in the path. Therefore, in order to derive feasible test cases, a test generation algorithm needs to dynamically acquire information about the model and verify the feasibility of the generated test path through the simulation execution of the model. The traditional method of constructing executable models by hard-coding each EFSM model under test has limitations: it is inflexible, time-consuming, and error-prone. To address this issue, this paper develops an open source test generation tool for testing EFSM-specified systems, PTSG, to support the automatic generation of executable test cases. It decouples the EFSM model description, parsing and simulation execution functions from the test generation algorithm, which can effectively improve the efficiency and quality of test generation. In particular, PTSG first uses a well-designed JSON syntax to describe the specific EFSM under test. Then, based on the model description file, it uses lexical and syntactic parsers to dynamically extract model information to construct various model objects in memory such as state configurations, transitions, etc. Finally, the tool provides a series of service interfaces to support model information acquisition, transition feasibility evaluation, and model simulation execution. A case study of test sequence generation for the SCP protocol model demonstrates the capability and promise of PTSG in generating executable test cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.10255v1-abstract-full').style.display = 'none'; document.getElementById('2209.10255v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022.
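</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark">To make the decoupling concrete, here is a minimal Python sketch of the core loop such a tool needs: an EFSM described declaratively, with guards evaluated against context variables, so that a generated event sequence can be checked for feasibility by simulated execution. The schema and the send/ack protocol below are invented for illustration and are not PTSG&#39;s actual JSON syntax.</span> </p> <pre><code class="language-python">
import json

MODEL = json.loads("""
{
  "initial": "idle",
  "vars": {"retries": 0},
  "transitions": [
    {"src": "idle", "dst": "sending", "event": "send",
     "guard": "retries &lt; 3", "update": "retries = retries + 1"},
    {"src": "sending", "dst": "idle", "event": "ack",
     "guard": "True", "update": "retries = 0"}
  ]
}
""")

def feasible(tr, ctx):
    """Evaluate a transition guard over the current context variables."""
    return eval(tr["guard"], {}, dict(ctx))

def execute(path):
    """Simulate an event sequence; return None if the path is infeasible."""
    state, ctx = MODEL["initial"], dict(MODEL["vars"])
    for event in path:
        nxt = next((t for t in MODEL["transitions"]
                    if t["src"] == state and t["event"] == event
                    and feasible(t, ctx)), None)
        if nxt is None:
            return None                  # guard conflict: infeasible path
        exec(nxt["update"], {}, ctx)     # apply the variable update
        state = nxt["dst"]
    return state, ctx

print(execute(["send", "ack", "send"]))  # a feasible test path
</code></pre> <p class="is-size-7">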
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.03100">arXiv:2209.03100</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.03100">pdf</a>, <a href="https://arxiv.org/format/2209.03100">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Effects of Archive Size on Computation Time and Solution Quality for Multi-Objective Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianye Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+K">Ke Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Ishibuchi%2C+H">Hisao Ishibuchi</a>, <a href="/search/cs?searchtype=author&amp;query=Nan%2C+Y">Yang Nan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.03100v2-abstract-short" style="display: inline;"> An unbounded external archive has been used to store all nondominated solutions found by an evolutionary multi-objective optimization algorithm in some studies. It has been shown that a selected solution subset from the stored solutions is often better than the final population. However, the use of the unbounded archive is not always realistic. When the number of examined solutions is huge, we mus&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.03100v2-abstract-full').style.display = 'inline'; document.getElementById('2209.03100v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.03100v2-abstract-full" style="display: none;"> An unbounded external archive has been used to store all nondominated solutions found by an evolutionary multi-objective optimization algorithm in some studies. It has been shown that a selected solution subset from the stored solutions is often better than the final population. However, the use of the unbounded archive is not always realistic. When the number of examined solutions is huge, we must pre-specify the archive size. In this study, we examine the effects of the archive size on three aspects: (i) the quality of the selected final solution set, (ii) the total computation time for the archive maintenance and the final solution set selection, and (iii) the required memory size. Unsurprisingly, the increase of the archive size improves the final solution set quality. Interestingly, the total computation time of a medium-size archive is much larger than that of a small-size archive and a huge-size archive (e.g., an unbounded archive). To decrease the computation time, we examine two ideas: periodical archive update and archiving only in later generations. Compared with updating the archive at every generation, the first idea can obtain almost the same final solution set quality using a much shorter computation time at the cost of a slight increase of the memory size. The second idea drastically decreases the computation time at the cost of a slight deterioration of the final solution set quality. Based on our experimental results, some suggestions are given about how to appropriately choose an archiving strategy and an archive size. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.03100v2-abstract-full').style.display = 'none'; document.getElementById('2209.03100v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.12327">arXiv:2207.12327</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.12327">pdf</a>, <a href="https://arxiv.org/format/2207.12327">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Technical Report: Assisting Backdoor Federated Learning with Whole Population Knowledge Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tian Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xueyang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.12327v1-abstract-short" style="display: inline;"> Due to the distributed nature of Federated Learning (FL), researchers have uncovered that FL is vulnerable to backdoor attacks, which aim at injecting a sub-task into the FL without corrupting the performance of the main task. Single-shot backdoor attack achieves high accuracy on both the main task and backdoor sub-task when injected at the FL model convergence. However, the early-injected single-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.12327v1-abstract-full').style.display = 'inline'; document.getElementById('2207.12327v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.12327v1-abstract-full" style="display: none;"> Due to the distributed nature of Federated Learning (FL), researchers have uncovered that FL is vulnerable to backdoor attacks, which aim at injecting a sub-task into the FL without corrupting the performance of the main task. Single-shot backdoor attack achieves high accuracy on both the main task and backdoor sub-task when injected at the FL model convergence. However, the early-injected single-shot backdoor attack is ineffective because: (1) the maximum backdoor effectiveness is not reached at injection because of the dilution effect from normal local updates; (2) the backdoor effect decreases quickly as the backdoor will be overwritten by the newcoming normal local updates. In this paper, we strengthen the early-injected single-shot backdoor attack utilizing FL model information leakage. We show that the FL convergence can be expedited if the client trains on a dataset that mimics the distribution and gradients of the whole population. 
Based on this observation, we proposed a two-phase backdoor attack, which includes a preliminary phase for the subsequent backdoor attack. In the preliminary phase, the attacker-controlled client first launches a whole population distribution inference attack and then trains on a locally crafted dataset that is aligned with both the gradient and inferred distribution. Benefiting from the preliminary phase, the later injected backdoor achieves better effectiveness as the backdoor effect will be less likely to be diluted by the normal model updates. Extensive experiments are conducted on MNIST dataset under various data heterogeneity settings to evaluate the effectiveness of the proposed backdoor attack. Results show that the proposed backdoor outperforms existing backdoor attacks in both success rate and longevity, even when defense mechanisms are in place. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.12327v1-abstract-full').style.display = 'none'; document.getElementById('2207.12327v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.12548">arXiv:2205.12548</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.12548">pdf</a>, <a href="https://arxiv.org/format/2205.12548">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RLPrompt: Optimizing Discrete Text Prompts with Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Deng%2C+M">Mingkai Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hsieh%2C+C">Cheng-Ping Hsieh</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yihan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Han Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M">Meng Song</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+E+P">Eric P. Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhiting Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.12548v3-abstract-short" style="display: inline;"> Prompting has shown impressive success in enabling large pretrained language models (LMs) to perform diverse NLP tasks, especially when only few downstream data are available. Automatically finding the optimal prompt for each task, however, is challenging. 
Most existing work resorts to tuning soft prompt (e.g., embeddings) which falls short of interpretability, reusability across LMs, and applicab&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12548v3-abstract-full').style.display = 'inline'; document.getElementById('2205.12548v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.12548v3-abstract-full" style="display: none;"> Prompting has shown impressive success in enabling large pretrained language models (LMs) to perform diverse NLP tasks, especially when only few downstream data are available. Automatically finding the optimal prompt for each task, however, is challenging. Most existing work resorts to tuning soft prompt (e.g., embeddings) which falls short of interpretability, reusability across LMs, and applicability when gradients are not accessible. Discrete prompt, on the other hand, is difficult to optimize, and is often created by &#34;enumeration (e.g., paraphrasing)-then-selection&#34; heuristics that do not explore the prompt space systematically. This paper proposes RLPrompt, an efficient discrete prompt optimization approach with reinforcement learning (RL). RLPrompt formulates a parameter-efficient policy network that generates the desired discrete prompt after training with reward. To overcome the complexity and stochasticity of reward signals by the large LM environment, we incorporate effective reward stabilization that substantially enhances the training efficiency. RLPrompt is flexibly applicable to different types of LMs, such as masked (e.g., BERT) and left-to-right models (e.g., GPTs), for both classification and generation tasks. Experiments on few-shot classification and unsupervised text style transfer show superior performance over a wide range of existing finetuning or prompting methods. Interestingly, the resulting optimized prompts are often ungrammatical gibberish text; and surprisingly, those gibberish prompts are transferrable between different LMs to retain significant performance, indicating LM prompting may not follow human language patterns. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12548v3-abstract-full').style.display = 'none'; document.getElementById('2205.12548v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2022 Camera Ready. 
Code available at https://github.com/mingkaid/rl-prompt</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.11087">arXiv:2205.11087</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.11087">pdf</a>, <a href="https://arxiv.org/ps/2205.11087">ps</a>, <a href="https://arxiv.org/format/2205.11087">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> MetaSlicing: A Novel Resource Allocation Framework for Metaverse </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chu%2C+N+H">Nam H. Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Hoang%2C+D+T">Dinh Thai Hoang</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+D+N">Diep N. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Phan%2C+K+T">Khoa T. Phan</a>, <a href="/search/cs?searchtype=author&amp;query=Dutkiewicz%2C+E">Eryk Dutkiewicz</a>, <a href="/search/cs?searchtype=author&amp;query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.11087v3-abstract-short" style="display: inline;"> Creating and maintaining the Metaverse requires enormous resources that have never been seen before, especially computing resources for intensive data processing to support the Extended Reality, enormous storage resources, and massive networking resources for maintaining ultra high-speed and low-latency connections. Therefore, this work aims to propose a novel framework, namely MetaSlicing, that c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.11087v3-abstract-full').style.display = 'inline'; document.getElementById('2205.11087v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.11087v3-abstract-full" style="display: none;"> Creating and maintaining the Metaverse requires enormous resources that have never been seen before, especially computing resources for intensive data processing to support the Extended Reality, enormous storage resources, and massive networking resources for maintaining ultra high-speed and low-latency connections. Therefore, this work aims to propose a novel framework, namely MetaSlicing, that can provide a highly effective and comprehensive solution in managing and allocating different types of resources for Metaverse applications. In particular, by observing that Metaverse applications may have common functions, we first propose grouping applications into clusters, called MetaInstances. In a MetaInstance, common functions can be shared among applications. 
As such, the same resources can be used by multiple applications simultaneously, thereby enhancing resource utilization dramatically. To address the real-time characteristics and the dynamics and uncertainty of resource demand in the Metaverse, we develop an effective framework based on the semi-Markov decision process and propose an intelligent admission control algorithm that can maximize resource utilization and enhance the Quality-of-Service for end-users. Extensive simulation results show that our proposed solution outperforms the Greedy-based policies by up to 80% and 47% in terms of long-term revenue for Metaverse providers and request acceptance probability, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.11087v3-abstract-full').style.display = 'none'; document.getElementById('2205.11087v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Revised figures, fixed typos</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.10607">arXiv:2205.10607</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.10607">pdf</a>, <a href="https://arxiv.org/format/2205.10607">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Coordinating Policies Among Multiple Agents via an Intelligent Communication Channel </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dianbo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+V">Vedant Shah</a>, <a href="/search/cs?searchtype=author&amp;query=Boussif%2C+O">Oussama Boussif</a>, <a href="/search/cs?searchtype=author&amp;query=Meo%2C+C">Cristian Meo</a>, <a href="/search/cs?searchtype=author&amp;query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Mozer%2C+M">Michael Mozer</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.10607v2-abstract-short" style="display: inline;"> In Multi-Agent Reinforcement Learning (MARL), specialized channels are often introduced that allow agents to communicate directly with one another. In this paper, we propose an alternative approach whereby agents communicate through an intelligent facilitator that learns to sift through and interpret signals provided by all agents to improve the agents&#39; collective performance.
To ensure that this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.10607v2-abstract-full').style.display = 'inline'; document.getElementById('2205.10607v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.10607v2-abstract-full" style="display: none;"> In Multi-Agent Reinforcement Learning (MARL), specialized channels are often introduced that allow agents to communicate directly with one another. In this paper, we propose an alternative approach whereby agents communicate through an intelligent facilitator that learns to sift through and interpret signals provided by all agents to improve the agents&#39; collective performance. To ensure that this facilitator does not become a centralized controller, agents are incentivized to reduce their dependence on the messages it conveys, and the messages can only influence the selection of a policy from a fixed set, not instantaneous actions given the policy. We demonstrate the strength of this architecture over existing baselines on several cooperative MARL environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.10607v2-abstract-full').style.display = 'none'; document.getElementById('2205.10607v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.06707">arXiv:2201.06707</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.06707">pdf</a>, <a href="https://arxiv.org/format/2201.06707">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning to Approximate: Auto Direction Vector Set Generation for Hypervolume Contribution Approximation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shang%2C+K">Ke Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianye Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Ishibuchi%2C+H">Hisao Ishibuchi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.06707v1-abstract-short" style="display: inline;"> Hypervolume contribution is an important concept in evolutionary multi-objective optimization (EMO). It is involved in hypervolume-based EMO algorithms and hypervolume subset selection algorithms. Its main drawback is that it is computationally expensive in high-dimensional spaces, which limits its applicability to many-objective optimization.
Recently, an R2 indicator variant (i.e.,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.06707v1-abstract-full').style.display = 'inline'; document.getElementById('2201.06707v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.06707v1-abstract-full" style="display: none;"> Hypervolume contribution is an important concept in evolutionary multi-objective optimization (EMO). It is involved in hypervolume-based EMO algorithms and hypervolume subset selection algorithms. Its main drawback is that it is computationally expensive in high-dimensional spaces, which limits its applicability to many-objective optimization. Recently, an R2 indicator variant (i.e., the $R_2^{\text{HVC}}$ indicator) has been proposed to approximate the hypervolume contribution. The $R_2^{\text{HVC}}$ indicator uses line segments along a number of direction vectors for hypervolume contribution approximation. It has been shown that different direction vector sets lead to different approximation quality. In this paper, we propose \textit{Learning to Approximate (LtA)}, a direction vector set generation method for the $R_2^{\text{HVC}}$ indicator. The direction vector set is automatically learned from training data. The learned direction vector set can then be used in the $R_2^{\text{HVC}}$ indicator to improve its approximation quality. The usefulness of the proposed LtA method is examined by comparing it with other commonly-used direction vector set generation methods for the $R_2^{\text{HVC}}$ indicator. Experimental results suggest the superiority of LtA over the other methods for generating high-quality direction vector sets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.06707v1-abstract-full').style.display = 'none'; document.getElementById('2201.06707v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022.
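</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Sketch:</span> <span class="has-text-grey-dark mathjax">For readers unfamiliar with the quantity being approximated, the sketch below estimates a solution&#39;s hypervolume contribution by plain Monte Carlo sampling (minimization, with a reference point). It is only a generic baseline to make the concept concrete; the $R_2^{\text{HVC}}$ indicator discussed above instead uses line segments along direction vectors, which this sketch does not implement.</span> </p> <pre><code class="language-python">
import random

def dominates(a, b):
    """Weak Pareto dominance for minimization, excluding equal points."""
    return all(x &lt;= y for x, y in zip(a, b)) and a != b

def hvc_estimate(s, others, ref, n=50_000):
    """Volume dominated by s (w.r.t. ref) but by no other solution."""
    box = 1.0
    for si, ri in zip(s, ref):
        box *= ri - si                   # volume of the box between s and ref
    hits = 0
    for _ in range(n):
        q = tuple(random.uniform(si, ri) for si, ri in zip(s, ref))
        if not any(dominates(a, q) for a in others):
            hits += 1                    # q is dominated only by s
    return box * hits / n

A = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]
print(hvc_estimate(A[1], [A[0], A[2]], ref=(1.0, 1.0)))  # approx. 0.09
</code></pre> <p class="is-size-7">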
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is currently under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.06700">arXiv:2201.06700</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.06700">pdf</a>, <a href="https://arxiv.org/format/2201.06700">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.ins.2022.11.155">10.1016/j.ins.2022.11.155 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Benchmarking Subset Selection from Large Candidate Solution Sets in Evolutionary Multi-objective Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shang%2C+K">Ke Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianye Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Ishibuchi%2C+H">Hisao Ishibuchi</a>, <a href="/search/cs?searchtype=author&amp;query=Nan%2C+Y">Yang Nan</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+L+M">Lie Meng Pang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.06700v2-abstract-short" style="display: inline;"> In the evolutionary multi-objective optimization (EMO) field, the standard practice is to present the final population of an EMO algorithm as the output. However, it has been shown that the final population often includes solutions which are dominated by other solutions generated and discarded in previous generations. Recently, a new EMO framework has been proposed to solve this issue by storing a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.06700v2-abstract-full').style.display = 'inline'; document.getElementById('2201.06700v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.06700v2-abstract-full" style="display: none;"> In the evolutionary multi-objective optimization (EMO) field, the standard practice is to present the final population of an EMO algorithm as the output. However, it has been shown that the final population often includes solutions which are dominated by other solutions generated and discarded in previous generations. Recently, a new EMO framework has been proposed to solve this issue by storing all the non-dominated solutions generated during the evolution in an archive and selecting a subset of solutions from the archive as the output. The key component in this framework is the subset selection from the archive which usually stores a large number of candidate solutions. However, most studies on subset selection focus on small candidate solution sets for environmental selection. 
There is no benchmark test suite for large-scale subset selection. This paper aims to fill this research gap by proposing a benchmark test suite for subset selection from large candidate solution sets, and comparing some representative methods using the proposed test suite. The proposed test suite together with the benchmarking studies provides a baseline for researchers to understand, use, compare, and develop subset selection methods in the EMO field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.06700v2-abstract-full').style.display = 'none'; document.getElementById('2201.06700v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.10298">arXiv:2110.10298</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.10298">pdf</a>, <a href="https://arxiv.org/format/2110.10298">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Incorporating Rich Social Interactions Into MDPs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tejwani%2C+R">Ravi Tejwani</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+Y">Yen-Ling Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Stankovits%2C+B">Bennett Stankovits</a>, <a href="/search/cs?searchtype=author&amp;query=Gutfreund%2C+D">Dan Gutfreund</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Katz%2C+B">Boris Katz</a>, <a href="/search/cs?searchtype=author&amp;query=Barbu%2C+A">Andrei Barbu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.10298v3-abstract-short" style="display: inline;"> Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology and economics can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other&#39;s hidden rewards. This extended Social MDP allows us to encode the five basi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.10298v3-abstract-full').style.display = 'inline'; document.getElementById('2110.10298v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.10298v3-abstract-full" style="display: none;"> Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.10298">arXiv:2110.10298</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.10298">pdf</a>, <a href="https://arxiv.org/format/2110.10298">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Incorporating Rich Social Interactions Into MDPs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tejwani%2C+R">Ravi Tejwani</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+Y">Yen-Ling Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Stankovits%2C+B">Bennett Stankovits</a>, <a href="/search/cs?searchtype=author&amp;query=Gutfreund%2C+D">Dan Gutfreund</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Katz%2C+B">Boris Katz</a>, <a href="/search/cs?searchtype=author&amp;query=Barbu%2C+A">Andrei Barbu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology and economics can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other&#39;s hidden rewards. This extended Social MDP allows us to encode the five basic interactions that underlie microsociology: cooperation, conflict, coercion, competition, and exchange. The result is a robotic agent capable of executing social interactions zero-shot in new environments; like humans, it can engage socially in novel ways even without a single example of that social interaction. Moreover, the judgments of these Social MDPs align closely with those of humans when considering which social interaction is taking place in an environment. This method both sheds light on the nature of social interactions, by providing concrete mathematical definitions, and brings rich social interactions into a mathematical framework that has proven to be natural for robotics: MDPs.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the 39th International Conference on Robotics and Automation (ICRA 2022)</span> </p> </li>
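<p class="is-size-7">As a concrete reading of the reward structure this abstract sketches, here is a toy social utility in which an agent values its own reward plus a weighted function of its estimate of another agent&#39;s reward. The linear form, the weight chi, and the numbers are assumptions for illustration, not the paper&#39;s model.</p>
<pre><code class="language-python">
# chi above 0 reads as cooperation, below 0 as conflict, 0 as a plain MDP.
def social_utility(own_reward, estimated_other_reward, chi):
    return own_reward + chi * estimated_other_reward

for name, chi in [("cooperation", 1.0), ("indifference", 0.0), ("conflict", -1.0)]:
    print(name, social_utility(own_reward=2.0, estimated_other_reward=3.0, chi=chi))
</code></pre>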
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.15877">arXiv:2106.15877</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.15877">pdf</a>, <a href="https://arxiv.org/format/2106.15877">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Experience-Driven PCG via Reinforcement Learning: A Super Mario Bros Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianye Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jialin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yannakakis%2C+G+N">Georgios N. Yannakakis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">We introduce a procedural content generation (PCG) framework at the intersection of experience-driven PCG and PCG via reinforcement learning, named ED(PCG)RL, EDRL for short. EDRL teaches RL designers to generate endless playable levels in an online manner while respecting particular experiences for the player, designed in the form of reward functions. The framework is tested initially in the Super Mario Bros game. In particular, the RL designers of Super Mario Bros generate and concatenate level segments while considering the diversity among the segments. The correctness of the generation is ensured by a neural net-assisted evolutionary level repairer, and the playability of the whole level is determined through AI-based testing. Our agents in this EDRL implementation learn to maximise a quantification of Koster&#39;s principle of fun by moderating the degree of diversity across level segments. Moreover, we test their ability to design fun levels that are diverse over time and playable. Our proposed framework is capable of generating endless, playable Super Mario Bros levels with varying degrees of fun, deviation from earlier segments, and playability. EDRL can be generalised to any game that is built as a segment-based sequential process and features a built-in compressed representation of its game content.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by the 2021 IEEE Conference on Games</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.04077">arXiv:2103.04077</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.04077">pdf</a>, <a href="https://arxiv.org/format/2103.04077">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1109/LRA.2022.3144779">10.1109/LRA.2022.3144779</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Show Me What You Can Do: Capability Calibration on Reachable Workspace for Human-Robot Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiaofeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+L">Luyao Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+H">Hongjing Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Song-Chun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">Aligning humans&#39; assessment of what a robot can do with its true capability is crucial for establishing a common ground between human and robot partners when they collaborate on a joint task. In this work, we propose an approach to calibrate humans&#39; estimate of a robot&#39;s reachable workspace through a small number of demonstrations before collaboration. We develop a novel motion planning method, REMP, which jointly optimizes the physical cost and the expressiveness of robot motion to reveal the robot&#39;s reachability to a human observer. Our experiments with human participants demonstrate that a short calibration using REMP can effectively bridge the gap between what a non-expert user thinks a robot can reach and the ground truth. We show that this calibration procedure not only results in better user perception, but also promotes more efficient human-robot collaborations in a subsequent joint task.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures, IEEE Robotics and Automation Letters (RA-L), 2022</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> X. Gao, L. Yuan, T. Shu, H. Lu and S. -C. Zhu, &#34;Show Me What You Can Do: Capability Calibration on Reachable Workspace for Human-Robot Collaboration,&#34; in IEEE Robotics and Automation Letters, doi: 10.1109/LRA.2022.3144779 </p> </li>
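<p class="is-size-7">The trade-off named in this abstract, expressiveness against physical cost, can be pictured as scoring candidate demonstrations with a weighted objective. The candidate names, scores, and weight below are made-up numbers, not REMP&#39;s actual planner.</p>
<pre><code class="language-python">
# Choose the demonstration that best balances how much of the workspace it
# reveals (expressiveness) against how costly it is to execute.
def pick_demonstration(candidates, lam=0.5):
    # candidates: list of (name, physical_cost, expressiveness) tuples
    return max(candidates, key=lambda c: c[2] - lam * c[1])

candidates = [("short sweep", 1.0, 0.4), ("full-arm arc", 3.0, 2.4), ("wiggle", 2.0, 0.9)]
print(pick_demonstration(candidates))   # -> ('full-arm arc', 3.0, 2.4)
</code></pre>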
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.01933">arXiv:2103.01933</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.01933">pdf</a>, <a href="https://arxiv.org/format/2103.01933">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Netanyahu%2C+A">Aviv Netanyahu</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Katz%2C+B">Boris Katz</a>, <a href="/search/cs?searchtype=author&amp;query=Barbu%2C+A">Andrei Barbu</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view and can interact with multiple objects in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors contributed equally; AAAI 2021; 13 pages, 7 figures; Project page: https://www.tshu.io/PHASE</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.12321">arXiv:2102.12321</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.12321">pdf</a>, <a href="https://arxiv.org/format/2102.12321">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AGENT: A Benchmark for Core Psychological Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Bhandwaldar%2C+A">Abhishek Bhandwaldar</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+C">Chuang Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+K+A">Kevin A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shari Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gutfreund%2C+D">Dan Gutfreund</a>, <a href="/search/cs?searchtype=author&amp;query=Spelke%2C+E">Elizabeth Spelke</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Ullman%2C+T+D">Tomer D. Ullman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">For machine agents to successfully interact with humans in real-world settings, they will need to develop an understanding of human mental life. Intuitive psychology, the ability to reason about hidden mental variables that drive observable actions, comes naturally to people: even pre-verbal infants can tell agents from objects, expecting agents to act efficiently to achieve goals given constraints. Despite recent interest in machine agents that reason about other agents, it is not clear if such agents learn or hold the core psychology principles that drive human reasoning. Inspired by cognitive development studies on intuitive psychology, we present a benchmark consisting of a large dataset of procedurally generated 3D animations, AGENT (Action, Goal, Efficiency, coNstraint, uTility), structured around four scenarios (goal preferences, action efficiency, unobserved constraints, and cost-reward trade-offs) that probe key concepts of core intuitive psychology. We validate AGENT with human ratings, propose an evaluation protocol emphasizing generalization, and compare two strong baselines built on Bayesian inverse planning and a Theory of Mind neural network. Our results suggest that to pass the designed tests of core intuitive psychology at human levels, a model must acquire or have built-in representations of how agents plan, combining utility computations and core knowledge of objects and physics.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2021, 12 pages, 7 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.05622">arXiv:2011.05622</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.05622">pdf</a>, <a href="https://arxiv.org/format/2011.05622">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1109/TG.2022.3164242">10.1109/TG.2022.3164242</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning with Dual-Observation for General Video Game Playing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+C">Chengpeng Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianye Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+H">Hao Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Togelius%2C+J">Julian Togelius</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+X">Xin Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jialin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">Reinforcement learning algorithms have performed well in playing challenging board and video games. More and more studies focus on improving the generalisation ability of reinforcement learning algorithms. The General Video Game AI Learning Competition aims to develop agents capable of learning to play different game levels that were unseen during training. This paper summarises the five editions of the General Video Game AI Learning Competition. At each edition, three new games were designed. The training and test levels were designed separately in the first three editions. Since 2020, three test levels of each game were generated by perturbing or combining two training levels. Then, we present a novel reinforcement learning technique with dual-observation for general video game playing, assuming that similar local information, rather than global information, is more likely to be observed across different levels. Instead of directly inputting a single, raw pixel-based screenshot of the current game screen, our proposed general technique takes the encoded, transformed global and local observations of the game screen as two simultaneous inputs, aiming at learning local information for playing new levels. Our proposed technique is implemented with three state-of-the-art reinforcement learning algorithms and tested on the game set of the 2020 General Video Game AI Learning Competition. Ablation studies show the outstanding performance of using encoded, transformed global and local observations as input.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been accepted by the IEEE Transactions on Games on March 21, 2022</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.09890">arXiv:2010.09890</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.09890">pdf</a>, <a href="https://arxiv.org/format/2010.09890">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Watch-And-Help: A Challenge for Social Perception and Human-AI Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Puig%2C+X">Xavier Puig</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zilin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Y">Yuan-Hong Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Fidler%2C+S">Sanja Fidler</a>, <a href="/search/cs?searchtype=author&amp;query=Torralba%2C+A">Antonio Torralba</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">In this paper, we introduce Watch-And-Help (WAH), a challenge for testing social intelligence in agents. In WAH, an AI agent needs to help a human-like agent perform a complex household task efficiently. To succeed, the AI agent needs to i) understand the underlying goal of the task by watching a single demonstration of the human-like agent performing the same task (social perception), and ii) coordinate with the human-like agent to solve the task in an unseen environment as fast as possible (human-AI collaboration). For this challenge, we build VirtualHome-Social, a multi-agent household environment, and provide a benchmark including both planning and learning based baselines. We evaluate the performance of AI agents with the human-like agent as well as with real humans using objective metrics and subjective user ratings. Experimental results demonstrate that the proposed challenge and virtual environment enable a systematic evaluation of the important aspects of machine social intelligence at scale.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2021</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.06991">arXiv:2008.06991</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.06991">pdf</a>, <a href="https://arxiv.org/format/2008.06991">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> In-situ Workflow Auto-tuning via Combining Performance Models of Component Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tong Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yanfei Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wozniak%2C+J">Justin Wozniak</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+X">Xiaoning Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Foster%2C+I">Ian Foster</a>, <a href="/search/cs?searchtype=author&amp;query=Kurc%2C+T">Tahsin Kurc</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">In-situ parallel workflows couple multiple component applications, such as simulation and analysis, via streaming data transfer in order to avoid data exchange via shared file systems. Such workflows are challenging to configure for optimal performance due to the large space of possible configurations. Expert experience is rarely sufficient to identify optimal configurations, and existing empirical auto-tuning approaches are inefficient due to the high cost of obtaining training data for machine learning models. It is also infeasible to optimize individual components independently, due to component interactions. We propose here a new auto-tuning method, Component-based Ensemble Active Learning (CEAL), that combines machine learning techniques with knowledge of in-situ workflow structure to enable automated workflow configuration with a limited number of performance measurements.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> </li>
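<p class="is-size-7">A minimal sketch of ensemble active learning in the spirit of this abstract: fit a small ensemble of performance models to the configurations measured so far, then measure next the configuration on which the ensemble disagrees most. The quadratic &#39;true_runtime&#39; surface, the config grid, and the polynomial models are fabricated for illustration and are not CEAL itself.</p>
<pre><code class="language-python">
# Each round: fit models of different capacity, find where their runtime
# predictions diverge most, and spend one measurement there.
import numpy as np

true_runtime = lambda x: (x - 3.0) ** 2 + 5.0            # hidden cost surface
configs = np.linspace(0.0, 10.0, 21)                     # candidate configs
measured = {x: true_runtime(x) for x in (0.0, 4.0, 7.0, 10.0)}

for step in range(4):
    xs = np.array(sorted(measured))
    ys = np.array([measured[x] for x in xs])
    preds = [np.polyval(np.polyfit(xs, ys, deg), configs) for deg in (1, 2, 3)]
    disagreement = np.std(preds, axis=0)
    for x in xs:                                         # skip known configs
        disagreement[np.isclose(configs, x)] = -1.0
    nxt = float(configs[int(np.argmax(disagreement))])
    measured[nxt] = true_runtime(nxt)                    # one new measurement
    print(f"step {step}: measured config {nxt:.1f}, runtime {measured[nxt]:.2f}")
</code></pre>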
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.12829">arXiv:2007.12829</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.12829">pdf</a>, <a href="https://arxiv.org/ps/2007.12829">ps</a>, <a href="https://arxiv.org/format/2007.12829">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Joint Featurewise Weighting and Local Structure Learning for Multi-view Subspace Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lina%2C+S">Shi-Xun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhongb%2C+G">Guo Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Ting Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">Multi-view clustering integrates multiple feature sets, which reveal distinct aspects of the data and provide complementary information to each other, to improve the clustering performance. It remains challenging to effectively exploit complementary information across multiple views since the original data often contain noise and are highly redundant. Moreover, most existing multi-view clustering methods only aim to explore the consistency of all views while ignoring the local structure of each view. However, it is necessary to take the local structure of each view into consideration, because different views would present different geometric structures while admitting the same cluster structure. To address the above issues, we propose a novel multi-view subspace clustering method via simultaneously assigning weights for different features and capturing local information of data in view-specific self-representation feature spaces. In particular, a common cluster structure regularization is adopted to guarantee consistency among different views. An efficient algorithm based on an augmented Lagrangian multiplier is also developed to solve the associated optimization problem. Experiments conducted on several benchmark datasets demonstrate that the proposed method achieves state-of-the-art performance. We provide the Matlab code on https://github.com/Ekin102003/JFLMSC.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> </li>
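<p class="is-size-7">The self-representation idea underlying this kind of subspace clustering can be shown compactly: learn one representation matrix per view and average the resulting affinities into a common structure. Plain ridge-regression self-representation stands in here for the paper&#39;s weighted, locality-aware objective, and the two random &#39;views&#39; are synthetic.</p>
<pre><code class="language-python">
# Z minimises ||X - X Z||_F^2 + lam ||Z||_F^2 with samples as columns of X;
# the closed form is (X^T X + lam I)^{-1} X^T X.
import numpy as np

def self_representation(X, lam=0.1):
    n = X.shape[1]
    Z = np.linalg.solve(X.T @ X + lam * np.eye(n), X.T @ X)
    np.fill_diagonal(Z, 0.0)          # a sample should not encode itself
    return Z

rng = np.random.default_rng(0)
base = np.repeat(np.eye(2), 5, axis=0).T                 # 2 clusters, 10 samples
views = [np.vstack([base, rng.normal(0, 0.05, (3, 10))]) for _ in range(2)]
Zs = [self_representation(V) for V in views]
affinity = sum(abs(Z) + abs(Z.T) for Z in Zs) / (2 * len(Zs))
print(np.round(affinity, 2))          # block structure marks the two clusters
</code></pre>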
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.12803">arXiv:2007.12803</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.12803">pdf</a>, <a href="https://arxiv.org/format/2007.12803">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Joint Mind Modeling for Explanation Generation in Complex Human-Robot Collaborative Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiaofeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+R">Ran Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yizhou Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Song-Chun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">Human collaborators can effectively communicate with their partners to finish a common task by inferring each other&#39;s mental states (e.g., goals, beliefs, and desires). Such mind-aware communication minimizes the discrepancy among collaborators&#39; mental states, and is crucial to the success in human ad-hoc teaming. We believe that robots collaborating with human users should demonstrate similar pedagogic behavior. Thus, in this paper, we propose a novel explainable AI (XAI) framework for achieving human-like communication in human-robot collaborations, where the robot builds a hierarchical mind model of the human user and generates explanations of its own mind as a form of communication, based on its online Bayesian inference of the user&#39;s mental state. To evaluate our framework, we conduct a user study on a real-time human-robot cooking task. Experimental results show that the generated explanations of our approach significantly improve the collaboration performance and user perception of the robot. Code and video demos are available on our project website: https://xfgao.github.io/xCookingWeb/.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE International Conference on Robot and Human Interactive Communication (RO-MAN 2020), 8 pages, 9 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.08037">arXiv:2007.08037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.08037">pdf</a>, <a href="https://arxiv.org/format/2007.08037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Active Visual Information Gathering for Vision-Language Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hanqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenguan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+W">Wei Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jianbing Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">Vision-language navigation (VLN) is the task of requiring an agent to carry out navigational instructions inside photo-realistic environments. One of the key challenges in VLN is how to conduct robust navigation by mitigating the uncertainty caused by ambiguous instructions and insufficient observation of the environment. Agents trained by current approaches typically suffer from this and would consequently struggle to avoid random and inefficient actions at every step. In contrast, when humans face such a challenge, they can still maintain robust navigation by actively exploring the surroundings to gather more information and thus make more confident navigation decisions. This work draws inspiration from human navigation behavior and endows an agent with an active information gathering ability for a more intelligent vision-language navigation policy. To achieve this, we propose an end-to-end framework for learning an exploration policy that decides i) when and where to explore, ii) what information is worth gathering during exploration, and iii) how to adjust the navigation decision after the exploration. The experimental results show that promising exploration strategies emerge from training, which leads to a significant boost in navigation performance. On the R2R challenge leaderboard, our agent gets promising results in all three VLN settings, i.e., single run, pre-exploration, and beam search.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV2020 (updated with improved performance on Pre-Explore and Beam Search settings); website: https://github.com/HanqingWangAI/Active_VLN</span> </p> </li>
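<p class="is-size-7">The &#39;when to explore&#39; decision named in this abstract can be illustrated with a simple uncertainty test: gather more information only when the navigation policy is unsure which action to take, here measured by the entropy of its action distribution. The distributions and threshold below are fabricated examples, not the paper&#39;s learned policy.</p>
<pre><code class="language-python">
# Explore when the Shannon entropy of the action distribution is high.
import math

def should_explore(action_probs, threshold=1.0):
    entropy = -sum(p * math.log(p) for p in action_probs if p > 0)
    return entropy > threshold

print(should_explore([0.9, 0.05, 0.05]))   # confident -> False, just act
print(should_explore([0.4, 0.3, 0.3]))     # ambiguous -> True, explore first
</code></pre>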
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.06148">arXiv:2005.06148</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.06148">pdf</a>, <a href="https://arxiv.org/format/2005.06148">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> A Novel CNet-assisted Evolutionary Level Repairer and Its Applications to Super Mario Bros </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tianye Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jialin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+X">Xin Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">Applying latent variable evolution to game level design has become more and more popular as little human expert knowledge is required. However, defective levels with illegal patterns may be generated due to the violation of constraints for level design. A traditional way of repairing the defective levels is programming specific rule-based repairers to patch the flaw. However, programming these constraints is sometimes complex and not straightforward. An autonomous level repairer which is capable of learning the constraints is needed. In this paper, we propose a novel approach, CNet, to learn the probability distribution of a tile given its surrounding tiles from a set of real levels, and then detect illegal tiles in newly generated levels. Then, an evolutionary repairer is designed to search for optimal replacement schemes, equipped with a novel search space constructed with the help of CNet and a novel heuristic function. The proposed approaches prove effective in our case study of repairing GAN-generated and artificially destroyed levels of the Super Mario Bros game. Our CNet-assisted evolutionary repairer can also be easily applied to other games whose levels can be represented by a matrix of objects or tiles.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at IEEE CEC2020</span> </p> </li>
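<p class="is-size-7">A toy stand-in for the detection step this abstract describes: count how often each tile appears with a given four-neighbour context in real levels, then flag tiles in a new level whose (context, tile) pair was never seen. The tiny character grids and the zero-count rule are illustrative assumptions; CNet itself is a learned model, not a count table.</p>
<pre><code class="language-python">
# Estimate P(tile | four neighbours) by counting, then flag unseen pairs.
from collections import Counter

def contexts(level):
    for r in range(1, len(level) - 1):
        for c in range(1, len(level[0]) - 1):
            ctx = (level[r-1][c], level[r+1][c], level[r][c-1], level[r][c+1])
            yield ctx, level[r][c]

train = ["-----", "--?--", "XXXXX"]            # legal pattern: '?' over ground
counts = Counter(contexts(train))

test = ["-----", "--?--", "XX-XX"]             # floor gap under the '?' block
for ctx, tile in contexts(test):
    if counts[(ctx, tile)] == 0:
        print("suspicious tile", repr(tile), "context", ctx)
</code></pre>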
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.09276">arXiv:2001.09276</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.09276">pdf</a>, <a href="https://arxiv.org/format/2001.09276">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1109/MCOM.2019.1800272">10.1109/MCOM.2019.1800272</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multi-operator Network Sharing for Massive IoT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Y">Yong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Krunz%2C+M">Marwan Krunz</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+T">Tao Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="has-text-grey-dark mathjax">A recent study predicts that by 2020 up to 50 billion IoT devices will be connected to the Internet, straining the capacity of wireless networks that have already been overloaded with data-hungry mobile applications, such as high-definition video streaming and virtual reality (VR) / augmented reality (AR). How to accommodate the demand for both a massive scale of IoT devices and high-speed cellular services in the physically limited spectrum, without significantly increasing the operational and infrastructure costs, is one of the main challenges for operators. In this article, we introduce a new multi-operator network sharing framework that supports the coexistence of IoT and high-speed cellular services. Our framework is based on the radio access network (RAN) sharing architecture recently introduced by 3GPP as a promising solution for operators to improve their resource utilization and reduce the system roll-out cost. We evaluate the performance of our proposed framework using the real base station location data in the city of Dublin collected from two major operators in Ireland. Numerical results show that our proposed framework can almost double the total number of IoT devices that can be supported and coexist with other cellular services compared with the case without network sharing.</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Communications Magazine, vol. 57, no. 4, pp. 96-101, April 2019 </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous</a> <a href="/search/?searchtype=author&amp;query=Shu%2C+T&amp;start=50" class="pagination-next">Next</a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Shu%2C+T&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1</a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shu%2C+T&amp;start=50" class="pagination-link" aria-label="Page 2">2</a> </li> </ul> </nav> </main>
href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
