Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 283 results for author: <span class="mathjax">Peng, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Peng%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Peng, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Peng%2C+Z&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Peng, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Peng%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+Z&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+Z&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+Z&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+Z&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20927">arXiv:2410.20927</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20927">pdf</a>, <a href="https://arxiv.org/format/2410.20927">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VLMimic: Vision Language Models are Visual Imitation Learner for Fine-grained Actions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guanyan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meiling Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+T">Te Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+Y">Yao Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+H">Haoyang Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+T">Tianxing Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zicai Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+M">Mengxiao Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haizhou Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+Y">Yufeng Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark 
mathjax" id="2410.20927v3-abstract-short" style="display: inline;"> Visual imitation learning (VIL) provides an efficient and intuitive strategy for robotic systems to acquire novel skills. Recent advancements in Vision Language Models (VLMs) have demonstrated remarkable performance in vision and language reasoning capabilities for VIL tasks. Despite the progress, current VIL methods naively employ VLMs to learn high-level plans from human videos, relying on pre-d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20927v3-abstract-full').style.display = 'inline'; document.getElementById('2410.20927v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20927v3-abstract-full" style="display: none;"> Visual imitation learning (VIL) provides an efficient and intuitive strategy for robotic systems to acquire novel skills. Recent advancements in Vision Language Models (VLMs) have demonstrated remarkable performance in vision and language reasoning capabilities for VIL tasks. Despite the progress, current VIL methods naively employ VLMs to learn high-level plans from human videos, relying on pre-defined motion primitives for executing physical interactions, which remains a major bottleneck. In this work, we present VLMimic, a novel paradigm that harnesses VLMs to directly learn even fine-grained action levels, only given a limited number of human videos. Specifically, VLMimic first grounds object-centric movements from human videos, and learns skills using hierarchical constraint representations, facilitating the derivation of skills with fine-grained action levels from limited human videos. These skills are refined and updated through an iterative comparison strategy, enabling efficient adaptation to unseen environments. Our extensive experiments exhibit that our VLMimic, using only 5 human videos, yields significant improvements of over 27% and 21% in RLBench and real-world manipulation tasks, and surpasses baselines by over 37% in long-horizon tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20927v3-abstract-full').style.display = 'none'; document.getElementById('2410.20927v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted for publication in the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18002">arXiv:2410.18002</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18002">pdf</a>, <a href="https://arxiv.org/format/2410.18002">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Digital Network Twins for Next-generation Wireless: Creation, Optimization, and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuchen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zifan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Hanzhi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mingzhe Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18002v1-abstract-short" style="display: inline;"> Digital network twins (DNTs), by representing a physical network using a virtual model, offer significant benefits such as streamlined network development, enhanced productivity, and cost reduction for next-generation (nextG) communication infrastructure. Existing works mainly describe the deployment of DNT technologies in various service sections.The full life cycle of DNTs for telecommunication&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18002v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18002v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18002v1-abstract-full" style="display: none;"> Digital network twins (DNTs), by representing a physical network using a virtual model, offer significant benefits such as streamlined network development, enhanced productivity, and cost reduction for next-generation (nextG) communication infrastructure. Existing works mainly describe the deployment of DNT technologies in various service sections.The full life cycle of DNTs for telecommunication has not yet been comprehensively studied, particularly in the aspects of fine-grained creation, real-time adaptation, resource-efficient deployment, and security protection. This article presents an in-depth overview of DNTs, exploring their concrete integration into networks and communication, covering the fundamental designs, the emergent applications, and critical challenges in multiple dimensions. We also include two detailed case studies to illustrate how DNTs can be applied in real-world scenarios such as wireless traffic forecasting and edge caching. Additionally, a forward-looking vision of the research opportunities in tackling the challenges of DNTs is provided, aiming to fully maximize the benefits of DNTs in nextG networks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18002v1-abstract-full').style.display = 'none'; document.getElementById('2410.18002v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15724">arXiv:2410.15724</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15724">pdf</a>, <a href="https://arxiv.org/format/2410.15724">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Universally Accessible Cross-Chain Options without Upfront Holder Collateral </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zifan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+Y">Yingjie Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingyu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15724v1-abstract-short" style="display: inline;"> Options are fundamental to blockchain-based financial markets, offering essential tools for risk management and price speculation, which enhance liquidity, flexibility, and market efficiency in decentralized finance (DeFi). Despite the growing interest in options for blockchain-resident assets, such as cryptocurrencies, current option mechanisms face significant challenges, including limited asset&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15724v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15724v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15724v1-abstract-full" style="display: none;"> Options are fundamental to blockchain-based financial markets, offering essential tools for risk management and price speculation, which enhance liquidity, flexibility, and market efficiency in decentralized finance (DeFi). Despite the growing interest in options for blockchain-resident assets, such as cryptocurrencies, current option mechanisms face significant challenges, including limited asset support, high trading delays, and the requirement for option holders to provide upfront collateral. In this paper, we present a protocol that addresses the aforementioned issues by facilitating efficient and universally accessible option trading without requiring holders to post collateral when establishing options. Our protocol&#39;s universality allows for cross-chain options involving nearly $\textit{any}$ assets on $\textit{any}$ two different blockchains, provided the chains&#39; programming languages can enforce and execute the necessary contract logic. A key innovation in our approach is the use of Double-Authentication-Preventing Signatures (DAPS), which significantly reduces trading latency. 
Additionally, by introducing a guarantee from the option writer, our protocol removes the need of upfront collateral from holders. Our evaluation demonstrates that the proposed scheme reduces option transfer latency to less than half of that in existing methods. Rigorous security analysis proves that our protocol achieves secure option trading, even when facing adversarial behaviors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15724v1-abstract-full').style.display = 'none'; document.getElementById('2410.15724v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 4 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14567">arXiv:2410.14567</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14567">pdf</a>, <a href="https://arxiv.org/format/2410.14567">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RAG-ConfusionQA: A Benchmark for Evaluating LLMs on Confusing Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Nian%2C+J">Jinming Nian</a>, <a href="/search/cs?searchtype=author&amp;query=Evfimievski%2C+A">Alexandre Evfimievski</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Y">Yi Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14567v1-abstract-short" style="display: inline;"> Conversational AI agents use Retrieval Augmented Generation (RAG) to provide verifiable document-grounded responses to user inquiries. However, many natural questions do not have good answers: about 25\% contain false assumptions~\cite{Yu2023:CREPE}, and over 50\% are ambiguous~\cite{Min2020:AmbigQA}. RAG agents need high-quality data to improve their responses to confusing questions. This paper p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14567v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14567v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14567v1-abstract-full" style="display: none;"> Conversational AI agents use Retrieval Augmented Generation (RAG) to provide verifiable document-grounded responses to user inquiries. However, many natural questions do not have good answers: about 25\% contain false assumptions~\cite{Yu2023:CREPE}, and over 50\% are ambiguous~\cite{Min2020:AmbigQA}. 
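The key primitive named in this abstract, Double-Authentication-Preventing Signatures (DAPS), is defined by the property that two signatures on different payloads for the same address leak the signer's secret key, which is what makes equivocation by a counterparty punishable. As a rough illustration of that property only (not the construction used in the paper), here is a toy Schnorr-style scheme in Python; the group parameters are demo-sized and the hash is a deliberately simple stand-in.

```python
# Toy illustration of the DAPS property: two signatures on different
# payloads under the SAME address reveal the secret key. Schnorr-like,
# over a small prime-order group; parameters are demo-sized, NOT secure.
P = 2 * 1019 + 1   # safe prime p = 2q + 1 (demo size)
Q = 1019           # prime order of the subgroup we work in
G = 4              # generator of the order-Q subgroup mod P

def h(addr: str, payload: str) -> int:
    # Toy stand-in for a collision-resistant hash, kept simple so the
    # demo is deterministic; a real scheme would use e.g. SHA-256.
    return sum(f"{addr}|{payload}".encode()) % Q

def keygen(a: int, b: int):
    return (pow(G, a, P), pow(G, b, P))  # public key (A, B)

def sign(a: int, b: int, addr: str, payload: str) -> int:
    return (a + h(addr, payload) * b) % Q

def verify(pub, addr, payload, s) -> bool:
    A, B = pub
    return pow(G, s, P) == A * pow(B, h(addr, payload), P) % P

def extract(addr, m1, s1, m2, s2):
    # Two signatures for the same addr but different payloads give two
    # linear equations in (a, b) mod Q: solving them recovers the key.
    h1, h2 = h(addr, m1), h(addr, m2)
    b = (s1 - s2) * pow(h1 - h2, -1, Q) % Q
    a = (s1 - h1 * b) % Q
    return a, b

a, b = 123, 456
pub = keygen(a, b)
s1 = sign(a, b, "option-42", "settle-on-chain-A")
s2 = sign(a, b, "option-42", "settle-on-chain-B")  # double authentication!
assert verify(pub, "option-42", "settle-on-chain-A", s1)
assert extract("option-42", "settle-on-chain-A", s1,
               "settle-on-chain-B", s2) == (a, b)
```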
4. arXiv:2410.14567 [pdf, other] cs.CL (Computation and Language), cs.AI (Artificial Intelligence), cs.IR (Information Retrieval)
   RAG-ConfusionQA: A Benchmark for Evaluating LLMs on Confusing Questions
   Authors: Zhiyuan Peng, Jinming Nian, Alexandre Evfimievski, Yi Fang
   Abstract: Conversational AI agents use Retrieval Augmented Generation (RAG) to provide verifiable, document-grounded responses to user inquiries. However, many natural questions do not have good answers: about 25% contain false assumptions [Yu2023:CREPE], and over 50% are ambiguous [Min2020:AmbigQA]. RAG agents need high-quality data to improve their responses to confusing questions. This paper presents a novel synthetic data generation method to efficiently create a diverse set of context-grounded confusing questions from a given document corpus. We conduct an empirical comparative evaluation of several large language models as RAG agents to measure the accuracy of confusion detection and appropriate response generation. We contribute a benchmark dataset to the public domain.
   Submitted 18 October, 2024; originally announced October 2024.
   Comments: under review
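The synthetic data generation step described above can be pictured as a small loop over corpus passages: prompt an LLM to invent a question whose premise conflicts with the passage, and keep the grounded (passage, question) pairs. A minimal sketch, assuming a generic complete(prompt) text-completion client; the prompt wording and JSON fields are invented for illustration, not taken from the paper.

```python
# Sketch of context-grounded confusing-question generation. `complete`
# stands in for any LLM text-completion client; prompt is illustrative.
import json
from dataclasses import dataclass

@dataclass
class ConfusionExample:
    passage: str
    question: str      # question whose premise conflicts with the passage
    explanation: str   # why the question is confusing, for evaluation

PROMPT = """Passage:
{passage}

Write a question a user might ask about this passage that contains a FALSE
assumption contradicted by the passage. Reply as JSON:
{{"question": "...", "explanation": "..."}}"""

def generate_confusing(passages, complete):
    dataset = []
    for passage in passages:
        reply = complete(PROMPT.format(passage=passage))
        try:
            obj = json.loads(reply)
        except json.JSONDecodeError:
            continue  # skip malformed generations
        dataset.append(ConfusionExample(passage, obj["question"],
                                        obj["explanation"]))
    return dataset
```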
5. arXiv:2410.13905 [pdf, other] cs.SI (Social and Information Networks), cs.AI (Artificial Intelligence), cs.IR (Information Retrieval), cs.LG (Machine Learning)
   P4GCN: Vertical Federated Social Recommendation with Privacy-Preserving Two-Party Graph Convolution Networks
   Authors: Zheng Wang, Wanwan Wang, Yimin Huang, Zhaopeng Peng, Ziqi Yang, Cheng Wang, Xiaoliang Fan
   Abstract: In recent years, graph neural networks (GNNs) have been commonly utilized in social recommendation systems. However, real-world scenarios often present challenges related to user privacy and business constraints, inhibiting direct access to valuable social information from other platforms. While many existing methods have tackled matrix factorization-based social recommendation without direct social data access, developing GNN-based federated social recommendation models under similar conditions remains largely unexplored. To address this issue, we propose a novel vertical federated social recommendation method that leverages privacy-preserving two-party graph convolution networks (P4GCN) to enhance recommendation accuracy without requiring direct access to sensitive social information. First, we introduce a Sandwich-Encryption module to ensure comprehensive data privacy during the collaborative computing process. Second, we provide a thorough theoretical analysis of the privacy guarantees, considering the participation of both curious and honest parties. Extensive experiments on four real-world datasets demonstrate that P4GCN outperforms state-of-the-art methods in recommendation accuracy. The code is available at https://github.com/WwZzz/P4GCN.
   Submitted 16 October, 2024; originally announced October 2024.

6. arXiv:2410.13639 [pdf, other] cs.CL (Computation and Language)
   A Comparative Study on Reasoning Patterns of OpenAI's o1 Model
   Authors: Siwei Wu, Zhongyuan Peng, Xinrun Du, Tuney Zheng, Minghao Liu, Jialong Wu, Jiachen Ma, Yizhi Li, Jian Yang, Wangchunshu Zhou, Qunshu Lin, Junbo Zhao, Zhaoxiang Zhang, Wenhao Huang, Ge Zhang, Chenghua Lin, J. H. Liu
   Abstract: Enabling Large Language Models (LLMs) to handle a wider range of complex tasks (e.g., coding, math) has drawn great attention from many researchers. As LLMs continue to evolve, merely increasing the number of model parameters yields diminishing performance improvements and heavy computational costs. Recently, OpenAI's o1 model has shown that inference strategies (i.e., Test-time Compute methods) can also significantly enhance the reasoning capabilities of LLMs. However, the mechanisms behind these methods are still unexplored. In our work, to investigate the reasoning patterns of o1, we compare o1 with existing Test-time Compute methods (BoN, Step-wise BoN, Agent Workflow, and Self-Refine), using OpenAI's GPT-4o as a backbone, on general reasoning benchmarks in three domains (math, coding, and commonsense reasoning). First, our experiments show that the o1 model achieves the best performance on most datasets. Second, for methods that search over diverse responses (e.g., BoN), we find that the reward model's capability and the search space both limit their upper bound. Third, for methods that break the problem into many sub-problems, Agent Workflow outperforms Step-wise BoN thanks to its domain-specific system prompts for planning better reasoning processes. Fourth, we summarize six reasoning patterns of o1 and provide a detailed analysis across several reasoning benchmarks.
   Submitted 22 October, 2024; v1 submitted 17 October, 2024; originally announced October 2024.
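Among the Test-time Compute baselines compared here, Best-of-N (BoN) is the simplest: sample N candidate responses from the backbone model and return the one a reward model scores highest. The observation above, that reward-model capability and search space bound such methods, falls directly out of this structure. A minimal sketch with generic sample and reward callables (placeholders, not the paper's code):

```python
# Best-of-N (BoN) sampling: draw N candidate responses and keep the one
# the reward model prefers. The quality ceiling is set by the N samples
# (search space) and by how well `reward` ranks them (reward model).
def best_of_n(prompt, sample, reward, n=16):
    candidates = [sample(prompt) for _ in range(n)]
    return max(candidates, key=lambda resp: reward(prompt, resp))
```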
7. arXiv:2410.11710 [pdf, other] cs.CL (Computation and Language)
   MTU-Bench: A Multi-granularity Tool-Use Benchmark for Large Language Models
   Authors: Pei Wang, Yanan Wu, Zekun Wang, Jiaheng Liu, Xiaoshuai Song, Zhongyuan Peng, Ken Deng, Chenchen Zhang, Jiakai Wang, Junran Peng, Ge Zhang, Hangyu Guo, Zhaoxiang Zhang, Wenbo Su, Bo Zheng
   Abstract: Large Language Models (LLMs) have displayed massive improvements in reasoning and decision-making skills and can hold natural conversations with users. Recently, many tool-use benchmark datasets have been proposed. However, existing datasets have the following limitations: (1) insufficient evaluation scenarios (e.g., they cover only limited tool-use scenes); and (2) extensive evaluation costs (e.g., GPT API costs). To address these limitations, we propose MTU-Bench, a multi-granularity tool-use benchmark for large language models. For the "multi-granularity" property, MTU-Bench covers five tool-usage scenes (single-turn single-tool, single-turn multiple-tool, multiple-turn single-tool, multiple-turn multiple-tool, and out-of-distribution tasks). Moreover, all evaluation metrics of MTU-Bench are based on the prediction results and the ground truth, without any GPT-based or human evaluation. MTU-Bench is collected by transforming existing high-quality datasets to simulate real-world tool-usage scenarios, and we also propose an instruction dataset, MTU-Instruct, to enhance the tool-use abilities of existing LLMs. Comprehensive experimental results demonstrate the effectiveness of MTU-Bench. Code and data will be released at https://github.com/MTU-Bench-Team/MTU-Bench.git.
   Submitted 15 October, 2024; originally announced October 2024.
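Because MTU-Bench scores predictions directly against ground truth rather than with a GPT or human judge, its metrics reduce to comparisons between predicted and reference tool calls. A hedged sketch of what such a GPT-free metric can look like; the (tool_name, arguments) layout is assumed for illustration, not the benchmark's actual schema:

```python
# GPT-free tool-use scoring: compare predicted tool calls against ground
# truth. A call is a (tool_name, arguments) pair; layout is illustrative.
def score(predictions, references):
    name_hits = arg_hits = 0
    for (p_tool, p_args), (r_tool, r_args) in zip(predictions, references):
        name_hits += p_tool == r_tool
        arg_hits += p_tool == r_tool and p_args == r_args
    n = len(references)
    return {"tool_acc": name_hits / n, "exact_match": arg_hits / n}
```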
8. arXiv:2410.08249 [pdf, other] cs.LG (Machine Learning), cs.AI (Artificial Intelligence)
   Federated Graph Learning for Cross-Domain Recommendation
   Authors: Ziqi Yang, Zhaopeng Peng, Zihui Wang, Jianzhong Qi, Chaochao Chen, Weike Pan, Chenglu Wen, Cheng Wang, Xiaoliang Fan
   Abstract: Cross-domain recommendation (CDR) offers a promising solution to the data sparsity problem by enabling knowledge transfer across source and target domains. However, many recent CDR models overlook crucial issues such as privacy and the risk of negative transfer (both of which degrade model performance), especially in multi-domain settings. To address these challenges, we propose FedGCDR, a novel federated graph learning framework that securely and effectively leverages positive knowledge from multiple source domains. First, we design a positive knowledge transfer module that ensures privacy during inter-domain knowledge transmission. This module employs differential privacy-based knowledge extraction combined with a feature mapping mechanism, transforming source-domain embeddings from federated graph attention networks into reliable domain knowledge. Second, we design a knowledge activation module that filters out potentially harmful or conflicting knowledge from source domains, addressing the issue of negative transfer. This module enhances target-domain training by expanding the target domain's graph to generate reliable domain attentions, and it fine-tunes the target model for improved negative-knowledge filtering and more accurate predictions. We conduct extensive experiments on 16 popular domains of the Amazon dataset, demonstrating that FedGCDR significantly outperforms state-of-the-art methods.
   Submitted 3 November, 2024; v1 submitted 10 October, 2024; originally announced October 2024.
   Comments: Accepted by NeurIPS'24
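The "differential privacy-based knowledge extraction" mentioned above can be pictured as the standard Gaussian mechanism: clip each source-domain embedding's norm and add calibrated noise before anything leaves the source party. A minimal sketch of that generic step (clip bound and noise scale are illustrative; FedGCDR's actual mechanism may differ):

```python
# Gaussian-mechanism style privatization of source-domain embeddings
# before cross-domain transfer: clip each row's L2 norm, then add noise
# proportional to the clip bound. Parameters are illustrative.
import numpy as np

def privatize(embeddings: np.ndarray, clip: float = 1.0,
              noise_multiplier: float = 0.5) -> np.ndarray:
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    clipped = embeddings * np.minimum(1.0, clip / np.maximum(norms, 1e-12))
    noise = np.random.normal(0.0, noise_multiplier * clip, clipped.shape)
    return clipped + noise
```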
9. arXiv:2410.08134 [pdf, other] cs.LG (Machine Learning), cs.AI (Artificial Intelligence)
   Steering Masked Discrete Diffusion Models via Discrete Denoising Posterior Prediction
   Authors: Jarrid Rector-Brooks, Mohsin Hasan, Zhangzhi Peng, Zachary Quinn, Chenghao Liu, Sarthak Mittal, Nouha Dziri, Michael Bronstein, Yoshua Bengio, Pranam Chatterjee, Alexander Tong, Avishek Joey Bose
   Abstract: Generative modeling of discrete data underlies important applications, spanning text-based agents like ChatGPT to the design of the very building blocks of life in protein sequences. However, application domains need to exert control over the generated data by steering the generative process, typically via RLHF, to satisfy a specified property, reward, or affinity metric. In this paper, we study the problem of steering Masked Diffusion Models (MDMs), a recent class of discrete diffusion models that offers a compelling alternative to traditional autoregressive models. We introduce Discrete Denoising Posterior Prediction (DDPP), a novel framework that casts the task of steering pre-trained MDMs as a problem of probabilistic inference by learning to sample from a target Bayesian posterior. Our DDPP framework leads to a family of three novel objectives that are all simulation-free, and thus scalable, while applying to general non-differentiable reward functions. Empirically, we instantiate DDPP by steering MDMs to perform class-conditional pixel-level image modeling, RLHF-based alignment of MDMs using text-based rewards, and fine-tuning protein language models to generate more diverse secondary structures and shorter proteins. We substantiate our designs via wet-lab validation, where we observe transient expression of reward-optimized protein sequences.
   Submitted 10 October, 2024; originally announced October 2024.

10. arXiv:2410.01262 [pdf, other] cs.CV (Computer Vision and Pattern Recognition), cs.LG (Machine Learning)
    Aggregation of Multi Diffusion Models for Enhancing Learned Representations
    Authors: Conghan Yue, Zhengwei Peng, Shiyan Du, Zhi Ji, Dongyu Zhang
    Abstract: Diffusion models have achieved remarkable success in image generation, particularly with the various applications of classifier-free guidance conditional diffusion models. While many diffusion models perform well when controlling a particular aspect among style, character, and interaction, they struggle with fine-grained control due to dataset limitations and intricate model architecture design. This paper introduces a novel algorithm, Aggregation of Multi Diffusion Models (AMDM), which synthesizes features from multiple diffusion models into a specified model, enhancing its learned representations to activate specific features for fine-grained control. AMDM consists of two key components: spherical aggregation and manifold optimization. Spherical aggregation merges intermediate variables from different diffusion models with minimal manifold deviation, while manifold optimization refines these variables to align with the intermediate data manifold, enhancing sampling quality. Experimental results demonstrate that AMDM significantly improves fine-grained control without additional training or inference time, proving its effectiveness. Additionally, it reveals that diffusion models initially focus on features such as position, attributes, and style, with later stages improving generation quality and consistency. AMDM offers a new perspective for tackling the challenges of fine-grained conditional control in diffusion models: we can fully utilize existing conditional diffusion models that control specific aspects, or develop new ones, and then aggregate them using the AMDM algorithm. This eliminates the need to construct complex datasets, design intricate model architectures, and incur high training costs. Code is available at: https://github.com/Hammour-steak/AMDM
    Submitted 2 October, 2024; originally announced October 2024.
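"Spherical aggregation" of intermediate variables "with minimal manifold deviation" suggests merging latents on a hypersphere (slerp-style) rather than averaging them linearly, since a linear mean shrinks the norm that diffusion intermediates are expected to keep. A hedged sketch of such a spherical merge for two models' latents; the exact AMDM operator may differ (see the linked code):

```python
# Slerp-style spherical merge of two diffusion models' intermediate
# latents: interpolate the direction on the unit sphere and keep an
# interpolated magnitude. Illustrative of "spherical aggregation".
import numpy as np

def spherical_merge(x_a: np.ndarray, x_b: np.ndarray, w: float = 0.5):
    na, nb = np.linalg.norm(x_a), np.linalg.norm(x_b)
    ua, ub = x_a / na, x_b / nb
    cos = np.clip(np.dot(ua.ravel(), ub.ravel()), -1.0, 1.0)
    theta = np.arccos(cos)
    if theta < 1e-6:  # nearly parallel: fall back to linear interpolation
        direction = (1 - w) * ua + w * ub
    else:
        direction = (np.sin((1 - w) * theta) * ua
                     + np.sin(w * theta) * ub) / np.sin(theta)
    return ((1 - w) * na + w * nb) * direction
```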
arXiv:2409.19599 [pdf, other] (https://arxiv.org/abs/2409.19599)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Gradient is All You Need: Gradient-Based Attention Fusion for Infrared Small Target Detection
Authors: Chen Hu, Yian Huang, Kexuan Li, Luping Zhang, Yiming Zhu, Yufei Peng, Tian Pu, Zhenming Peng
Abstract: Infrared small target detection (IRSTD) is widely used in civilian and military applications. However, IRSTD encounters several challenges, including the tendency for small and dim targets to be obscured by complex backgrounds. To address this issue, we propose the Gradient Network (GaNet), which aims to extract and preserve edge and gradient information of small targets. GaNet employs the Gradient Transformer (GradFormer) module, simulating central difference convolutions (CDC) to extract and integrate gradient features with deeper features. Furthermore, we propose a global feature extraction model (GFEM) that offers a comprehensive perspective to prevent the network from focusing solely on details while neglecting the background information. We compare the network with state-of-the-art (SOTA) approaches, and the results demonstrate that our method performs effectively. Our source code is available at https://github.com/greekinRoma/Gradient-Transformer.
Submitted 29 September, 2024; originally announced September 2024.

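GradFormer is said to simulate central difference convolutions. The common CDC formulation from prior work subtracts a theta-weighted central term, which is computable with an extra 1x1 convolution over the summed kernel weights; a minimal sketch under that assumption (GaNet's exact variant may differ):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CentralDifferenceConv2d(nn.Module):
    """3x3 convolution mixed with a central-difference term, weighted by theta."""
    def __init__(self, in_ch, out_ch, theta=0.7):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False)
        self.theta = theta

    def forward(self, x):
        out = self.conv(x)
        # Convolving x with each filter's kernel sum reproduces the central part.
        kernel_sum = self.conv.weight.sum(dim=(2, 3), keepdim=True)
        return out - self.theta * F.conv2d(x, kernel_sum)
```

At theta = 0 this reduces to a vanilla convolution; larger theta emphasizes difference (gradient) information, the property the abstract leans on for dim small targets.
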
arXiv:2409.18343 [pdf, other] (https://arxiv.org/abs/2409.18343)
Subjects: cs.AI (Artificial Intelligence)
Title: Improving Agent Behaviors with RL Fine-tuning for Autonomous Driving
Authors: Zhenghao Peng, Wenjie Luo, Yiren Lu, Tianyi Shen, Cole Gulino, Ari Seff, Justin Fu
Abstract: A major challenge in autonomous vehicle research is modeling agent behaviors, which has critical applications including constructing realistic and reliable simulations for off-board evaluation and forecasting traffic agents' motion for onboard planning. While supervised learning has shown success in modeling agents across various domains, these models can suffer from distribution shift when deployed at test time. In this work, we improve the reliability of agent behaviors by closed-loop fine-tuning of behavior models with reinforcement learning. Our method demonstrates improved overall performance, as well as improved targeted metrics such as collision rate, on the Waymo Open Sim Agents challenge. Additionally, we present a novel policy evaluation benchmark to directly assess the ability of simulated agents to measure the quality of autonomous vehicle planners, and we demonstrate the effectiveness of our approach on this new benchmark.
Submitted 26 September, 2024; originally announced September 2024.
ACM Class: I.2.6; I.2.9

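"Closed-loop fine-tuning" means rolling the behavior model out in simulation and updating it on the outcomes, rather than on logged trajectories alone. A bare-bones REINFORCE-style sketch with a hypothetical simulator interface (the paper's actual algorithm, reward design, and the Waymo tooling are not specified in the abstract):

```python
import torch

def closed_loop_finetune(policy, simulator, optimizer, episodes=100, horizon=80):
    """Roll out the pretrained behavior model and reinforce low-collision runs."""
    for _ in range(episodes):
        obs = simulator.reset()
        log_probs, rewards = [], []
        for _ in range(horizon):
            dist = policy(obs)                           # action distribution
            action = dist.sample()
            log_probs.append(dist.log_prob(action).sum())
            obs, reward, done = simulator.step(action)   # e.g. -1.0 on collision
            rewards.append(reward)
            if done:
                break
        ret = sum(rewards)                               # undiscounted return
        loss = -ret * torch.stack(log_probs).sum()       # REINFORCE estimator
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```
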
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16605v1-abstract-full').style.display = 'none'; document.getElementById('2409.16605v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15317">arXiv:2409.15317</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.15317">pdf</a>, <a href="https://arxiv.org/format/2409.15317">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Shared Autonomy with IDA: Interventional Diffusion Assistance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=McMahan%2C+B+J">Brandon J. McMahan</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhenghao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bolei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Kao%2C+J+C">Jonathan C. Kao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15317v1-abstract-short" style="display: inline;"> The rapid development of artificial intelligence (AI) has unearthed the potential to assist humans in controlling advanced technologies. Shared autonomy (SA) facilitates control by combining inputs from a human pilot and an AI copilot. In prior SA studies, the copilot is constantly active in determining the action played at each time step. This limits human autonomy and may have deleterious effect&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15317v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15317v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15317v1-abstract-full" style="display: none;"> The rapid development of artificial intelligence (AI) has unearthed the potential to assist humans in controlling advanced technologies. Shared autonomy (SA) facilitates control by combining inputs from a human pilot and an AI copilot. In prior SA studies, the copilot is constantly active in determining the action played at each time step. This limits human autonomy and may have deleterious effects on performance. In general, the amount of helpful copilot assistance can vary greatly depending on the task dynamics. We therefore hypothesize that human autonomy and SA performance improve through dynamic and selective copilot intervention. 
arXiv:2409.15317 [pdf, other] (https://arxiv.org/abs/2409.15317)
Subjects: cs.HC (Human-Computer Interaction); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); cs.RO (Robotics)
Title: Shared Autonomy with IDA: Interventional Diffusion Assistance
Authors: Brandon J. McMahan, Zhenghao Peng, Bolei Zhou, Jonathan C. Kao
Abstract: The rapid development of artificial intelligence (AI) has unearthed the potential to assist humans in controlling advanced technologies. Shared autonomy (SA) facilitates control by combining inputs from a human pilot and an AI copilot. In prior SA studies, the copilot is constantly active in determining the action played at each time step. This limits human autonomy and may have deleterious effects on performance. In general, the amount of helpful copilot assistance can vary greatly depending on the task dynamics. We therefore hypothesize that human autonomy and SA performance improve through dynamic and selective copilot intervention. To address this, we develop a goal-agnostic intervention assistance (IA) that dynamically shares control by having the copilot intervene only when the expected value of the copilot's action exceeds that of the human's action across all possible goals. We implement IA with a diffusion copilot (termed IDA) trained on expert demonstrations with goal masking. We prove a lower bound on the performance of IA that depends on pilot and copilot performance. Experiments with simulated human pilots show that IDA achieves higher performance than pilot-only and traditional SA control in variants of the Reacher environment and Lunar Lander. We then demonstrate that IDA achieves better control in Lunar Lander with human-in-the-loop experiments. Human participants report greater autonomy with IDA and prefer IDA over pilot-only and traditional SA control. We attribute the success of IDA to preserving human autonomy while simultaneously offering assistance to prevent the human pilot from entering universally bad states.
Submitted 5 September, 2024; originally announced September 2024.
Comments: 10 pages, 4 main figures, 2 appendix figures

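The stated intervention rule is concrete enough to write down: the copilot acts only when its action is expected to beat the pilot's under every candidate goal. A minimal sketch, with goal-conditioned value functions assumed to be given:

```python
def ida_action(goal_values, state, a_pilot, a_copilot):
    """Goal-agnostic intervention: return the copilot's action only if its
    expected value exceeds the pilot's for all possible goals.
    goal_values: dict mapping each goal to a Q(state, action) callable."""
    worst_advantage = min(
        q(state, a_copilot) - q(state, a_pilot) for q in goal_values.values()
    )
    return a_copilot if worst_advantage > 0 else a_pilot
```

Because the minimum over goals must be positive, the copilot only overrides the pilot in states that are bad regardless of what the human is trying to do, matching the "universally bad states" framing above.
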
arXiv:2409.14859 [pdf, other] (https://arxiv.org/abs/2409.14859)
Subjects: cs.HC (Human-Computer Interaction)
Title: MentalImager: Exploring Generative Images for Assisting Support-Seekers' Self-Disclosure in Online Mental Health Communities
Authors: Han Zhang, Jiaqi Zhang, Yuxiang Zhou, Ryan Louie, Taewook Kim, Qingyu Guo, Shuailin Li, Zhenhui Peng
Abstract: Support-seekers' self-disclosure of their suffering experiences, thoughts, and feelings in a post can help them get needed peer support in online mental health communities (OMHCs). However, such mental health self-disclosure can be challenging. Images can facilitate the manifestation of relevant experiences and feelings in the text; yet, relevant images are not always available. In this paper, we present a technical prototype named MentalImager and validate in a human evaluation study that it can generate topically and emotionally relevant images based on seekers' drafted posts or specified keywords. Two user studies demonstrate that MentalImager not only improves seekers' satisfaction with their self-disclosure in their posts but also invokes support-providers' empathy for the seekers and willingness to offer help. Such improvements are credited to the generated images, which help seekers express their emotions and inspire them to add more details about their experiences and feelings. We report concerns about MentalImager and discuss insights for supporting self-disclosure in OMHCs.
Submitted 23 September, 2024; originally announced September 2024.

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09740v2-abstract-full').style.display = 'none'; document.getElementById('2409.09740v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09398">arXiv:2409.09398</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.09398">pdf</a>, <a href="https://arxiv.org/format/2409.09398">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Language-Queried Target Sound Extraction Without Parallel Training Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+H">Hao Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yukai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+M">Mingjie Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+Q">Qiuqiang Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Ju Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09398v1-abstract-short" style="display: inline;"> Language-queried target sound extraction (TSE) aims to extract specific sounds from mixtures based on language queries. Traditional fully-supervised training schemes require extensively annotated parallel audio-text data, which are labor-intensive. We introduce a language-free training scheme, requiring only unlabelled audio clips for TSE model training by utilizing the multi-modal representation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09398v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09398v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09398v1-abstract-full" style="display: none;"> Language-queried target sound extraction (TSE) aims to extract specific sounds from mixtures based on language queries. Traditional fully-supervised training schemes require extensively annotated parallel audio-text data, which are labor-intensive. We introduce a language-free training scheme, requiring only unlabelled audio clips for TSE model training by utilizing the multi-modal representation alignment nature of the contrastive language-audio pre-trained model (CLAP). In a vanilla language-free training stage, target audio is encoded using the pre-trained CLAP audio encoder to form a condition embedding for the TSE model, while during inference, user language queries are encoded by CLAP text encoder. 
arXiv:2409.09398 [pdf, other] (https://arxiv.org/abs/2409.09398)
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Language-Queried Target Sound Extraction Without Parallel Training Data
Authors: Hao Ma, Zhiyuan Peng, Xu Li, Yukai Li, Mingjie Shao, Qiuqiang Kong, Ju Liu
Abstract: Language-queried target sound extraction (TSE) aims to extract specific sounds from mixtures based on language queries. Traditional fully-supervised training schemes require extensively annotated parallel audio-text data, which are labor-intensive. We introduce a language-free training scheme, requiring only unlabelled audio clips for TSE model training, by utilizing the multi-modal representation alignment nature of the contrastive language-audio pre-trained model (CLAP). In a vanilla language-free training stage, target audio is encoded using the pre-trained CLAP audio encoder to form a condition embedding for the TSE model, while during inference, user language queries are encoded by the CLAP text encoder. This straightforward approach faces challenges due to the modality gap between training and inference queries and information leakage from direct exposure to target audio during training. To address this, we propose a retrieval-augmented strategy. Specifically, we create an embedding cache using audio captions generated by a large language model (LLM). During training, target audio embeddings retrieve text embeddings from this cache to use as condition embeddings, ensuring consistent modalities between training and inference and eliminating information leakage. Extensive experimental results show that our retrieval-augmented approach achieves consistent and notable performance improvements over the existing state-of-the-art with better generalizability.
Submitted 14 September, 2024; originally announced September 2024.
Comments: Submitted to ICASSP 2025

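The retrieval-augmented conditioning is straightforward to sketch: at training time the CLAP audio embedding of the target clip is replaced by the nearest text embedding in a cache built from LLM-generated captions, so the TSE model only ever conditions on text-side embeddings. A minimal numpy version (the CLAP encoders themselves are assumed external):

```python
import numpy as np

def condition_from_cache(audio_emb, caption_embs):
    """Return the cached caption (text) embedding most similar to the target
    audio's CLAP embedding; used in place of the audio embedding itself."""
    sims = caption_embs @ audio_emb / (
        np.linalg.norm(caption_embs, axis=1) * np.linalg.norm(audio_emb) + 1e-8
    )
    return caption_embs[int(np.argmax(sims))]
```
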
arXiv:2409.08712 [pdf, other] (https://arxiv.org/abs/2409.08712)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition)
Title: Layerwise Change of Knowledge in Neural Networks
Authors: Xu Cheng, Lei Cheng, Zhaoran Peng, Yang Xu, Tian Han, Quanshi Zhang
Abstract: This paper aims to explain how a deep neural network (DNN) gradually extracts new knowledge and forgets noisy features through layers in forward propagation. Although the definition of knowledge encoded by the DNN has not yet reached a consensus, previous studies have derived a series of mathematical evidence to take interactions as symbolic primitive inference patterns encoded by a DNN. We extend the definition of interactions and, for the first time, extract interactions encoded by intermediate layers. We quantify and track the newly emerged interactions and the forgotten interactions in each layer during forward propagation, which sheds new light on the learning behavior of DNNs. The layer-wise change of interactions also reveals the change of the generalization capacity and the instability of feature representations of a DNN.
Submitted 13 September, 2024; originally announced September 2024.

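The "interactions" in this line of work are commonly defined in Harsanyi-dividend form: I(S) = sum over T within S of (-1)^(|S|-|T|) * v(T), where v(T) is the model output when only the input variables in T are kept. A small self-contained sketch of that base definition (the paper's layerwise extension is not reproduced here):

```python
from itertools import chain, combinations

def subsets(s):
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))

def interaction(v, S):
    """Harsanyi interaction of variable subset S under set-function v."""
    S = tuple(S)
    return sum((-1) ** (len(S) - len(T)) * v(frozenset(T)) for T in subsets(S))

# Toy check: v counts pairs, so only 2-element subsets carry interaction.
v = lambda T: len(T) * (len(T) - 1) / 2
print(interaction(v, {0, 1}))  # 1.0
```
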
arXiv:2409.07869 [pdf, other] (https://arxiv.org/abs/2409.07869)
Subjects: cs.CL (Computation and Language)
Title: Learning Rules from KGs Guided by Language Models
Authors: Zihang Peng, Daria Stepanova, Vinh Thinh Ho, Heike Adel, Alessandra Russo, Simon Ott
Abstract: Advances in information extraction have enabled the automatic construction of large knowledge graphs (e.g., Yago, Wikidata or Google KG), which are widely used in many applications like semantic search or data analytics. However, due to their semi-automatic construction, KGs are often incomplete. Rule learning methods, concerned with the extraction of frequent patterns from KGs and casting them into rules, can be applied to predict potentially missing facts. A crucial step in this process is rule ranking. Ranking of rules is especially challenging over highly incomplete or biased KGs (e.g., KGs predominantly storing facts about famous people), as in this case biased rules might fit the data best and be ranked at the top based on standard statistical metrics like rule confidence. To address this issue, prior works proposed to rank rules not only relying on the original KG but also facts predicted by a KG embedding model. At the same time, with the recent rise of Language Models (LMs), several works have claimed that LMs can be used as alternative means for KG completion. In this work, our goal is to verify to which extent the exploitation of LMs is helpful for improving the quality of rule learning systems.
Submitted 12 September, 2024; originally announced September 2024.
Comments: proof of concept

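The rule-ranking problem above revolves around standard confidence scores. For a Horn rule body(x, y) => head(x, y) over a triple store, confidence is the fraction of body matches that also satisfy the head; a toy sketch (single-atom bodies only) of the quantity that incomplete or biased KGs distort:

```python
def rule_confidence(kg, body_rel, head_rel):
    """Confidence of the rule body_rel(x, y) => head_rel(x, y) over a KG
    given as a set of (subject, relation, object) triples."""
    body_pairs = {(s, o) for s, r, o in kg if r == body_rel}
    head_pairs = {(s, o) for s, r, o in kg if r == head_rel}
    return len(body_pairs & head_pairs) / len(body_pairs) if body_pairs else 0.0

# Example: livesIn(x, y) => citizenOf(x, y) holds for 1 of 2 body matches.
kg = {("ann", "livesIn", "fr"), ("ann", "citizenOf", "fr"), ("bob", "livesIn", "fr")}
print(rule_confidence(kg, "livesIn", "citizenOf"))  # 0.5
```

If citizenOf(bob, fr) is true but simply missing from the KG, the score is an underestimate; the approaches discussed above plug in embedding- or LM-predicted facts to correct such estimates.
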
arXiv:2409.04965 [pdf, other] (https://arxiv.org/abs/2409.04965)
Subjects: cs.RO (Robotics)
Title: Enhancing Socially-Aware Robot Navigation through Bidirectional Natural Language Conversation
Authors: Congcong Wen, Yifan Liu, Geeta Chandra Raju Bethala, Zheng Peng, Hui Lin, Yu-Shen Liu, Yi Fang
Abstract: Robot navigation is an important research field with applications in various domains. However, traditional approaches often prioritize efficiency and obstacle avoidance, neglecting a nuanced understanding of human behavior or intent in shared spaces. With the rise of service robots, there's an increasing emphasis on endowing robots with the capability to navigate and interact in complex real-world environments. Socially aware navigation has recently become a key research area. However, existing work either predicts pedestrian movements or simply emits alert signals to pedestrians, falling short of facilitating genuine interactions between humans and robots. In this paper, we introduce the Hybrid Soft Actor-Critic with Large Language Model (HSAC-LLM), an innovative model designed for socially-aware navigation in robots. This model seamlessly integrates deep reinforcement learning with large language models, enabling it to predict both continuous and discrete actions for navigation. Notably, HSAC-LLM facilitates bidirectional interaction based on natural language with pedestrian models. When a potential collision with pedestrians is detected, the robot can initiate or respond to communications with pedestrians, obtaining and executing subsequent avoidance strategies. Experimental results in 2D simulation, the Gazebo environment, and the real-world environment demonstrate that HSAC-LLM not only efficiently enables interaction with humans but also exhibits superior performance in navigation and obstacle avoidance compared to state-of-the-art DRL algorithms. We believe this innovative paradigm opens up new avenues for effective and socially aware human-robot interactions in dynamic environments. Videos are available at https://hsacllm.github.io/.
Submitted 8 September, 2024; originally announced September 2024.

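The abstract's "continuous and discrete actions" suggest a hybrid policy head: a Gaussian branch for motion commands and a categorical branch for the communicate-or-not decision. A structural sketch only, with placeholder sizes (the actual HSAC-LLM architecture is not detailed in the abstract):

```python
import torch
import torch.nn as nn

class HybridPolicyHead(nn.Module):
    """Shared trunk feeding a continuous (e.g. velocity) head and a discrete
    (e.g. initiate-communication) head, as in hybrid SAC-style policies."""
    def __init__(self, obs_dim, cont_dim=2, n_discrete=2, hidden=128):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
        self.mu = nn.Linear(hidden, cont_dim)
        self.log_std = nn.Linear(hidden, cont_dim)
        self.logits = nn.Linear(hidden, n_discrete)

    def forward(self, obs):
        h = self.trunk(obs)
        cont = torch.distributions.Normal(self.mu(h), self.log_std(h).exp())
        disc = torch.distributions.Categorical(logits=self.logits(h))
        return cont, disc
```
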
arXiv:2409.03183 [pdf, other] (https://arxiv.org/abs/2409.03183)
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Bypassing DARCY Defense: Indistinguishable Universal Adversarial Triggers
Authors: Zuquan Peng, Yuanyuan He, Jianbing Ni, Ben Niu
Abstract: Neural network (NN) classification models for Natural Language Processing (NLP) are vulnerable to the Universal Adversarial Triggers (UAT) attack, which triggers a model to produce a specific prediction for any input. DARCY borrows the "honeypot" concept to bait multiple trapdoors, effectively detecting the adversarial examples generated by UAT. Unfortunately, we find a new UAT generation method, called IndisUAT, which produces triggers (i.e., tokens) and uses them to craft adversarial examples whose feature distribution is indistinguishable from that of the benign examples in a randomly-chosen category at the detection layer of DARCY. The produced adversarial examples incur the maximal loss of predicting results in the DARCY-protected models. Meanwhile, the produced triggers are effective in black-box models for text generation, text inference, and reading comprehension. Finally, the evaluation results under NN models for NLP tasks indicate that the IndisUAT method can effectively circumvent DARCY and penetrate other defenses. For example, IndisUAT can reduce the true positive rate of DARCY's detection by at least 40.8% and 90.6%, and drop the accuracy by at least 33.3% and 51.6%, in the RNN and CNN models, respectively. IndisUAT reduces the accuracy of BERT's adversarial defense model by at least 34.0%, and makes the GPT-2 language model spew racist outputs even when conditioned on non-racial context.
Submitted 4 September, 2024; originally announced September 2024.
Comments: 13 pages, 5 figures
ACM Class: I.2.7

arXiv:2409.01555 [pdf, other] (https://arxiv.org/abs/2409.01555)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: EA-RAS: Towards Efficient and Accurate End-to-End Reconstruction of Anatomical Skeleton
Authors: Zhiheng Peng, Kai Zhao, Xiaoran Chen, Li Ma, Siyu Xia, Changjie Fan, Weijian Shang, Wei Jing
Abstract: Efficient, accurate and low-cost estimation of human skeletal information is crucial for a range of applications such as biology education and human-computer interaction. However, current simple skeleton models, which are typically based on 2D-3D joint points, fall short in terms of anatomical fidelity, restricting their utility in such fields. On the other hand, more complex models, while anatomically precise, are hindered by sophisticated multi-stage processing and the need for extra data like skin meshes, making them unsuitable for real-time applications. To this end, we propose EA-RAS (Towards Efficient and Accurate End-to-End Reconstruction of Anatomical Skeleton), a single-stage, lightweight, and plug-and-play anatomical skeleton estimator that can provide real-time, accurate, anatomically realistic skeletons for arbitrary poses using only a single RGB image input. Additionally, EA-RAS estimates the conventional human-mesh model explicitly, which not only enhances the functionality but also leverages the outside skin information by integrating features into the inside skeleton modeling process. In this work, we also develop a progressive training strategy and integrate it with an enhanced optimization process, enabling the network to obtain initial weights using only a small skin dataset and achieve self-supervision in skeleton reconstruction. We also provide an optional lightweight post-processing optimization strategy to further improve accuracy for scenarios that prioritize precision over real-time processing. The experiments demonstrate that our regression method is over 800 times faster than existing methods, meeting real-time requirements. Additionally, the post-processing optimization strategy can enhance reconstruction accuracy by over 50% and achieve a speed increase of more than 7 times.
Submitted 2 September, 2024; originally announced September 2024.
Comments: 13 pages, 15 figures

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16303v1-abstract-full').style.display = 'none'; document.getElementById('2408.16303v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14144">arXiv:2408.14144</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14144">pdf</a>, <a href="https://arxiv.org/format/2408.14144">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Neighborhood and Global Perturbations Supported SAM in Federated Learning: From Local Tweaks To Global Awareness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Boyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zihao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yafei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingliang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shengbo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+B">Baofeng Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Cong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14144v2-abstract-short" style="display: inline;"> Federated Learning (FL) can be coordinated under the orchestration of a central server to collaboratively build a privacy-preserving model without the need for data exchange. However, participant data heterogeneity leads to local optima divergence, subsequently affecting convergence outcomes. Recent research has focused on global sharpness-aware minimization (SAM) and dynamic regularization techni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14144v2-abstract-full').style.display = 'inline'; document.getElementById('2408.14144v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14144v2-abstract-full" style="display: none;"> Federated Learning (FL) can be coordinated under the orchestration of a central server to collaboratively build a privacy-preserving model without the need for data exchange. However, participant data heterogeneity leads to local optima divergence, subsequently affecting convergence outcomes. Recent research has focused on global sharpness-aware minimization (SAM) and dynamic regularization techniques to enhance consistency between global and local generalization and optimization objectives. Nonetheless, the estimation of global SAM introduces additional computational and memory overhead, while dynamic regularization suffers from bias in the local and global dual variables due to training isolation. 
arXiv:2408.14144 [pdf, other] (https://arxiv.org/abs/2408.14144)
Subjects: cs.LG (Machine Learning); cs.DC (Distributed, Parallel, and Cluster Computing)
Title: Neighborhood and Global Perturbations Supported SAM in Federated Learning: From Local Tweaks To Global Awareness
Authors: Boyuan Li, Zihao Peng, Yafei Li, Mingliang Xu, Shengbo Chen, Baofeng Ji, Cong Shen
Abstract: Federated Learning (FL) can be coordinated under the orchestration of a central server to collaboratively build a privacy-preserving model without the need for data exchange. However, participant data heterogeneity leads to local optima divergence, subsequently affecting convergence outcomes. Recent research has focused on global sharpness-aware minimization (SAM) and dynamic regularization techniques to enhance consistency between global and local generalization and optimization objectives. Nonetheless, the estimation of global SAM introduces additional computational and memory overhead, while dynamic regularization suffers from bias in the local and global dual variables due to training isolation. In this paper, we propose a novel FL algorithm, FedTOGA, designed to consider optimization and generalization objectives while maintaining minimal uplink communication overhead. By linking local perturbations to global updates, global generalization consistency is improved. Additionally, global updates are used to correct local dynamic regularizers, reducing dual-variable bias and enhancing optimization consistency. Global updates are passively received by clients, reducing overhead. We also propose neighborhood perturbation to approximate local perturbation, analyzing its strengths and limitations. Theoretical analysis shows that FedTOGA achieves faster O(1/T) convergence under non-convex functions. Empirical studies demonstrate that FedTOGA outperforms state-of-the-art algorithms, with a 1% accuracy increase and 30% faster convergence.
Submitted 29 August, 2024; v1 submitted 26 August, 2024; originally announced August 2024.

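"Linking local perturbations to global updates" can be sketched as a SAM ascent direction that mixes the client's local gradient with the last received global model delta before normalizing to radius rho. The exact coupling, and the neighborhood approximation, are the paper's contribution and may well differ; this only conveys the flavor:

```python
import torch

def linked_perturbation(local_grads, global_delta, rho=0.05, lam=1.0):
    """Per-tensor SAM perturbation steered by the global update direction."""
    mixed = [g + lam * d for g, d in zip(local_grads, global_delta)]
    norm = torch.sqrt(sum((m ** 2).sum() for m in mixed)) + 1e-12
    return [rho * m / norm for m in mixed]
```

Clients would evaluate the local loss at the perturbed weights, as in standard SAM, and then take the sharpness-aware gradient step.
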
Currently, numerous researchers are dedicating their attention to this field, conducting extensive studies a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12880v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12880v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12880v1-abstract-full" style="display: none;"> The rapid development of artificial intelligence has constantly reshaped the field of intelligent healthcare and medicine. As a vital technology, multimodal learning has increasingly garnered interest due to data complementarity, comprehensive modeling form, and great application potential. Currently, numerous researchers are dedicating their attention to this field, conducting extensive studies and constructing abundant intelligent systems. Naturally, an open question arises that has multimodal learning delivered universal intelligence in healthcare? To answer the question, we adopt three unique viewpoints for a holistic analysis. Firstly, we conduct a comprehensive survey of the current progress of medical multimodal learning from the perspectives of datasets, task-oriented methods, and universal foundation models. Based on them, we further discuss the proposed question from five issues to explore the real impacts of advanced techniques in healthcare, from data and technologies to performance and ethics. The answer is that current technologies have NOT achieved universal intelligence and there remains a significant journey to undertake. Finally, in light of the above reviews and discussions, we point out ten potential directions for exploration towards the goal of universal intelligence in healthcare. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12880v1-abstract-full').style.display = 'none'; document.getElementById('2408.12880v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11744">arXiv:2408.11744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.11744">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> JieHua Paintings Style Feature Extracting Model using Stable Diffusion with ControlNet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yujia Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haofeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+X">Xinyu Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zihan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yinan Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11744v1-abstract-short" style="display: inline;"> This study proposes a novel approach to extract stylistic features of Jiehua: the utilization of the Fine-tuned Stable Diffusion Model with ControlNet (FSDMC) to refine depiction techniques from artists&#39; Jiehua. The training data for FSDMC is based on the opensource Jiehua artist&#39;s work collected from the Internet, which were subsequently manually constructed in the format of (Original Image, Cann&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11744v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11744v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11744v1-abstract-full" style="display: none;"> This study proposes a novel approach to extract stylistic features of Jiehua: the utilization of the Fine-tuned Stable Diffusion Model with ControlNet (FSDMC) to refine depiction techniques from artists&#39; Jiehua. The training data for FSDMC is based on the opensource Jiehua artist&#39;s work collected from the Internet, which were subsequently manually constructed in the format of (Original Image, Canny Edge Features, Text Prompt). By employing the optimal hyperparameters identified in this paper, it was observed FSDMC outperforms CycleGAN, another mainstream style transfer model. FSDMC achieves FID of 3.27 on the dataset and also surpasses CycleGAN in terms of expert evaluation. This not only demonstrates the model&#39;s high effectiveness in extracting Jiehua&#39;s style features, but also preserves the original pre-trained semantic information. The findings of this study suggest that the application of FSDMC with appropriate hyperparameters can enhance the efficacy of the Stable Diffusion Model in the field of traditional art style migration tasks, particularly within the context of Jiehua. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11744v1-abstract-full').style.display = 'none'; document.getElementById('2408.11744v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICCSMT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09357">arXiv:2408.09357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.09357">pdf</a>, <a href="https://arxiv.org/format/2408.09357">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Meta-Learning Empowered Meta-Face: Personalized Speaking Style Adaptation for Audio-Driven 3D Talking Face Animation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xukun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+F">Fengxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Ziqiao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+K">Kejian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jun He</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+B">Biao Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Z">Zhaoxin Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongyan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09357v1-abstract-short" style="display: inline;"> Audio-driven 3D face animation is increasingly vital in live streaming and augmented reality applications. While remarkable progress has been observed, most existing approaches are designed for specific individuals with predefined speaking styles, thus neglecting the adaptability to varied speaking styles. To address this limitation, this paper introduces MetaFace, a novel methodology meticulously&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09357v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09357v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09357v1-abstract-full" style="display: none;"> Audio-driven 3D face animation is increasingly vital in live streaming and augmented reality applications. While remarkable progress has been observed, most existing approaches are designed for specific individuals with predefined speaking styles, thus neglecting the adaptability to varied speaking styles. 
arXiv:2408.08561 [pdf] cs.CV
Title: A New Chinese Landscape Paintings Generation Model based on Stable Diffusion using DreamBooth
Authors: Yujia Gu, Xinyu Fang, Xueyuan Deng, Zihan Peng, Yinan Peng
Abstract: This study introduces a method combining the Stable Diffusion Model (SDM) with Parameter-Efficient Fine-Tuning for generating Chinese Landscape Paintings. Training is accelerated by combining LoRA with the pre-trained SDM and DreamBooth with the pre-trained SDM, respectively. On the Chinese Landscape Paintings Internet dataset used in this paper, SDM combined with DreamBooth exhibits superior performance, outperforming the other models, including the generic pre-trained SDM and the LoRA-based fine-tuned SDM. SDM combined with DreamBooth achieves an FID of 12.75 on the dataset and outperforms all other models in expert evaluation, highlighting its versatility in the field of Chinese Landscape Paintings given the unique identifier, high fidelity, and high quality. This study illustrates the potential of specialised fine-tuning methods to improve the performance of SDM on domain-specific tasks, particularly in the domain of Landscape Paintings.
Submitted 22 August, 2024; v1 submitted 16 August, 2024; originally announced August 2024.
Comments: accepted by AHPCAI

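For context on the LoRA variant compared above, attaching low-rank adapters to a pre-trained SD UNet typically looks like the sketch below (using diffusers and peft); the target module names and rank are common choices, not the paper's reported configuration.

```python
from diffusers import UNet2DConditionModel
from peft import LoraConfig, get_peft_model

# Load the pre-trained UNet and attach LoRA adapters to its attention
# projections; only the small adapter matrices are then trained.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet")
lora = LoraConfig(r=8, lora_alpha=16,
                  target_modules=["to_q", "to_k", "to_v", "to_out.0"])
unet = get_peft_model(unet, lora)
unet.print_trainable_parameters()  # a small fraction of the UNet's weights

# DreamBooth, by contrast, fine-tunes on a rare identifier token bound to
# the new concept, typically with a prior-preservation loss.
```
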
arXiv:2408.08444 [pdf, other] cs.CL, cs.AI, cs.IR, cs.LG
Title: W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question Answering
Authors: Jinming Nian, Zhiyuan Peng, Qifan Wang, Yi Fang
Abstract: In knowledge-intensive tasks such as open-domain question answering (OpenQA), Large Language Models (LLMs) often struggle to generate factual answers relying solely on their internal (parametric) knowledge. To address this limitation, Retrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving relevant information from external sources, thereby positioning the retriever as a pivotal component. Although dense retrieval demonstrates state-of-the-art performance, its training poses challenges due to the scarcity of ground-truth evidence, largely attributed to the high costs of human annotation. In this paper, we propose W-RAG, which utilizes the ranking capabilities of LLMs to create weakly labeled data for training dense retrievers. Specifically, we rerank the top-$K$ passages retrieved via BM25 by assessing the probability that the LLM will generate the correct answer given the question and each passage. The highest-ranking passages are then used as positive training examples for dense retrieval. Our comprehensive experiments across four publicly available OpenQA datasets demonstrate that our approach enhances both retrieval and OpenQA performance compared to baseline models.
Submitted 15 August, 2024; originally announced August 2024.

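A minimal sketch of the weak-labeling idea, scoring a passage by the log-probability the LM assigns to the gold answer, might look like this; gpt2 and the prompt wording are stand-ins, and the paper's actual LLM, prompt, and normalization may differ. Passages with the highest scores would become weak positives for the dense retriever.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # stand-in LLM for the sketch
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def answer_logprob(question, passage, answer):
    """Score a passage by the log-probability of the gold answer under
    teacher forcing (assumes the prompt tokens remain a prefix of the
    full tokenization)."""
    prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer:"
    n_prompt = tok(prompt, return_tensors="pt").input_ids.shape[1]
    full = tok(prompt + " " + answer, return_tensors="pt").input_ids
    with torch.no_grad():
        logprobs = lm(full).logits.log_softmax(-1)
    # logits at position i predict token i + 1: gather answer tokens only
    ans = full[0, n_prompt:]
    return logprobs[0, n_prompt - 1:-1].gather(-1, ans[:, None]).sum().item()
```
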
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04325">arXiv:2408.04325</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04325">pdf</a>, <a href="https://arxiv.org/format/2408.04325">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HydraFormer: One Encoder For All Subsampling Rates </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yaoxun Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xingchen Song</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhendong Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Binbin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04325v1-abstract-short" style="display: inline;"> In automatic speech recognition, subsampling is essential for tackling diverse scenarios. However, the inadequacy of a single subsampling rate to address various real-world situations often necessitates training and deploying multiple models, consequently increasing associated costs. To address this issue, we propose HydraFormer, comprising HydraSub, a Conformer-based encoder, and a BiTransformer-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04325v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04325v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04325v1-abstract-full" style="display: none;"> In automatic speech recognition, subsampling is essential for tackling diverse scenarios. However, the inadequacy of a single subsampling rate to address various real-world situations often necessitates training and deploying multiple models, consequently increasing associated costs. To address this issue, we propose HydraFormer, comprising HydraSub, a Conformer-based encoder, and a BiTransformer-based decoder. HydraSub encompasses multiple branches, each representing a distinct subsampling rate, allowing for the flexible selection of any branch during inference based on the specific use case. HydraFormer can efficiently manage different subsampling rates, significantly reducing training and deployment expenses. Experiments on AISHELL-1 and LibriSpeech datasets reveal that HydraFormer effectively adapts to various subsampling rates and languages while maintaining high recognition performance. Additionally, HydraFormer showcases exceptional stability, sustaining consistent performance under various initialization conditions, and exhibits robust transferability by learning from pretrained single subsampling rate automatic speech recognition models\footnote{Model code and scripts: https://github.com/HydraFormer/hydraformer}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04325v1-abstract-full').style.display = 'none'; document.getElementById('2408.04325v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICME 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03191">arXiv:2408.03191</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03191">pdf</a>, <a href="https://arxiv.org/format/2408.03191">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Integrated Intention Prediction and Decision-Making with Spectrum Attention Net and Proximal Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xiao Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+C">Chengzhen Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenru Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zengqi Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Ming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03191v1-abstract-short" style="display: inline;"> For autonomous driving in highly dynamic environments, it is anticipated to predict the future behaviors of surrounding vehicles (SVs) and make safe and effective decisions. However, modeling the inherent coupling effect between the prediction and decision-making modules has been a long-standing challenge, especially when there is a need to maintain appropriate computational efficiency. To tackle&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03191v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03191v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03191v1-abstract-full" style="display: none;"> For autonomous driving in highly dynamic environments, it is anticipated to predict the future behaviors of surrounding vehicles (SVs) and make safe and effective decisions. However, modeling the inherent coupling effect between the prediction and decision-making modules has been a long-standing challenge, especially when there is a need to maintain appropriate computational efficiency. To tackle these problems, we propose a novel integrated intention prediction and decision-making approach, which explicitly models the coupling relationship and achieves efficient computation. Specifically, a spectrum attention net is designed to predict the intentions of SVs by capturing the trends of each frequency component over time and their interrelations. 
arXiv:2408.01102 [pdf, other] cs.HC; DOI: 10.1145/3654777.3676390
Title: LessonPlanner: Assisting Novice Teachers to Prepare Pedagogy-Driven Lesson Plans with Large Language Models
Authors: Haoxiang Fan, Guanzheng Chen, Xingbo Wang, Zhenhui Peng
Abstract: Preparing a lesson plan, e.g., a detailed road map with strategies and materials for instructing a 90-minute class, is beneficial yet challenging for novice teachers. Large language models (LLMs) can ease this process by generating adaptive content for lesson plans that teachers would otherwise need to create from scratch or find in existing resources. In this work, we first conduct a formative study with six novice teachers to understand their needs when preparing lesson plans with LLMs. We then develop LessonPlanner, which assists users in interactively constructing lesson plans with adaptive LLM-generated content based on Gagne's nine events. Our within-subjects study (N=12) shows that, compared to the baseline ChatGPT interface, LessonPlanner can significantly improve the quality of the resulting lesson plans and ease users' workload during preparation. Our expert interviews (N=6) further demonstrate LessonPlanner's usefulness in suggesting effective teaching strategies and meaningful educational resources. We discuss concerns about, and design considerations for, supporting teaching activities with LLMs.
Submitted 2 August, 2024; originally announced August 2024.
Comments: 20 pages

</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20053">arXiv:2407.20053</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20053">pdf</a>, <a href="https://arxiv.org/format/2407.20053">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> Orca: Ocean Significant Wave Height Estimation with Spatio-temporally Aware Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Ronghui Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jilin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhong Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xi Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Chenjuan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20053v1-abstract-short" style="display: inline;"> Significant wave height (SWH) is a vital metric in marine science, and accurate SWH estimation is crucial for various applications, e.g., marine energy development, fishery, early warning systems for potential risks, etc. Traditional SWH estimation methods that are based on numerical models and physical theories are hindered by computational inefficiencies. Recently, machine learning has emerged a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20053v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20053v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20053v1-abstract-full" style="display: none;"> Significant wave height (SWH) is a vital metric in marine science, and accurate SWH estimation is crucial for various applications, e.g., marine energy development, fishery, early warning systems for potential risks, etc. Traditional SWH estimation methods that are based on numerical models and physical theories are hindered by computational inefficiencies. Recently, machine learning has emerged as an appealing alternative to improve accuracy and reduce computational time. However, due to limited observational technology and high costs, the scarcity of real-world data restricts the potential of machine learning models. To overcome these limitations, we propose an ocean SWH estimation framework, namely Orca. Specifically, Orca enhances the limited spatio-temporal reasoning abilities of classic LLMs with a novel spatiotemporal aware encoding module. 
arXiv:2407.18487 [pdf, other] cs.CV
Title: SMPISD-MTPNet: Scene Semantic Prior-Assisted Infrared Ship Detection Using Multi-Task Perception Networks
Authors: Chen Hu, Xiaogang Dong, Yian Huang, Lele Wang, Liang Xu, Tian Pu, Zhenming Peng
Abstract: Infrared ship detection (IRSD) has received increasing attention in recent years due to the robustness of infrared images to adverse weather. However, a large number of false alarms may occur in complex scenes. To address these challenges, we propose the Scene Semantic Prior-Assisted Multi-Task Perception Network (SMPISD-MTPNet), which comprises three stages: scene semantic extraction, deep feature extraction, and prediction. In the scene semantic extraction stage, a Scene Semantic Extractor (SSE) guides the network with features extracted according to expert knowledge. In the deep feature extraction stage, a backbone network extracts deep features, which are then integrated by a fusion network to enhance detection across targets of varying sizes. In the prediction stage, the Multi-Task Perception Module, comprising a Gradient-based Module and a Scene Segmentation Module, enables precise detection of small and dim targets within complex scenes. For training, we introduce a Soft Fine-tuning strategy to suppress the distortion caused by data augmentation. Moreover, given the lack of a publicly available dataset labelled for scenes, we introduce the Infrared Ship Dataset with Scene Segmentation (IRSDSS). Finally, we evaluate the network against state-of-the-art (SOTA) methods and show that SMPISD-MTPNet outperforms existing approaches. The source code and dataset are available at https://github.com/greekinRoma/KMNDNet.
Submitted 25 July, 2024; originally announced July 2024.

arXiv:2407.18064 [pdf, other] cs.HC
Title: ComPeer: A Generative Conversational Agent for Proactive Peer Support
Authors: Tianjian Liu, Hongzheng Zhao, Yuheng Liu, Xingbo Wang, Zhenhui Peng
Abstract: Conversational Agents (CAs) acting as peer supporters have been widely studied and shown to benefit people's mental health. However, previous peer support CAs either are user-initiated or follow predefined rules to initiate conversations, which may discourage users from engaging and building relationships with the CAs for long-term benefits. In this paper, we develop ComPeer, a generative CA that can proactively offer adaptive peer support to users. ComPeer leverages large language models to detect and reflect on significant events in the dialogue, enabling it to strategically plan the timing and content of proactive care. In addition, ComPeer incorporates peer support strategies, conversation history, and its persona into its generated messages. Our one-week between-subjects study (N=24) demonstrates ComPeer's strength in providing peer support over time and in boosting users' engagement compared to a baseline user-initiated CA.
Submitted 5 August, 2024; v1 submitted 25 July, 2024; originally announced July 2024.
Comments: To appear at the 2024 ACM Symposium on User Interface Software and Technology (UIST); 22 pages (7 figures, 7 tables)

</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at the 2024 ACM Symposium on User Interface Software and Technology (UIST); 22 pages (7 figures, 7 tables)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16397">arXiv:2407.16397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16397">pdf</a>, <a href="https://arxiv.org/format/2407.16397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> On ADMM in Heterogeneous Federated Learning: Personalization, Robustness, and Fairness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Shengkun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+J">Jinshan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Sheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yuan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaodong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyong Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16397v1-abstract-short" style="display: inline;"> Statistical heterogeneity is a root cause of tension among accuracy, fairness, and robustness of federated learning (FL), and is key in paving a path forward. Personalized FL (PFL) is an approach that aims to reduce the impact of statistical heterogeneity by developing personalized models for individual users, while also inherently providing benefits in terms of fairness and robustness. However, e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16397v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16397v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16397v1-abstract-full" style="display: none;"> Statistical heterogeneity is a root cause of tension among accuracy, fairness, and robustness of federated learning (FL), and is key in paving a path forward. Personalized FL (PFL) is an approach that aims to reduce the impact of statistical heterogeneity by developing personalized models for individual users, while also inherently providing benefits in terms of fairness and robustness. However, existing PFL frameworks focus on improving the performance of personalized models while neglecting the global model. Moreover, these frameworks achieve sublinear convergence rates and rely on strong assumptions. In this paper, we propose FLAME, an optimization framework by utilizing the alternating direction method of multipliers (ADMM) to train personalized and global models. We propose a model selection strategy to improve performance in situations where clients have different types of heterogeneous data. 
arXiv:2407.13309 [pdf, other] cs.CV, cs.MM; DOI: 10.1145/3664647.3680935
Title: Exposure Completing for Temporally Consistent Neural High Dynamic Range Video Rendering
Authors: Jiahao Cui, Wei Jiang, Zhan Peng, Zhiyu Pan, Zhiguo Cao
Abstract: Rendering high dynamic range (HDR) video from low dynamic range (LDR) videos whose frames alternate exposures is challenging, because exposure information changes and is partially absent at each time stamp; this causes existing methods to produce flickering HDR results. In this paper, we propose a novel paradigm that renders HDR frames by completing the absent exposure information, so that the exposure information at each time stamp is complete and consistent. Our approach interpolates neighboring LDR frames in the time dimension to reconstruct LDR frames for the absent exposures. Combining the interpolated and given LDR frames makes the complete set of exposure information available at each time stamp. This benefits the fusion process for HDR results, reducing noise and ghosting artifacts and therefore improving temporal consistency. Extensive experimental evaluations on standard benchmarks demonstrate that our method achieves state-of-the-art performance, highlighting the importance of completing absent exposures in HDR video rendering. The code is available at https://github.com/cuijiahao666/NECHDR.
Submitted 4 August, 2024; v1 submitted 18 July, 2024; originally announced July 2024.
Comments: 9 pages, 6 figures, accepted by ACM-MM 2024 (poster)

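Once the absent exposures have been filled in by interpolation, the per-time-stamp fusion can be as simple as a weighted radiance merge; the sketch below assumes a linear camera response and already-aligned frames, which glosses over the learned components of the actual method.

```python
import numpy as np

def merge_exposures(ldr_frames, exposures, eps=1e-6):
    """Merge aligned LDR frames (one per exposure, values in [0, 1]) into a
    linear HDR frame; a hat-shaped weight trusts mid-tone pixels most."""
    acc, wsum = 0.0, 0.0
    for img, t in zip(ldr_frames, exposures):   # t: exposure time
        w = 1.0 - np.abs(2.0 * img - 1.0)       # downweight under/over-exposure
        acc += w * (img / t)                    # back-project to radiance
        wsum += w
    return acc / (wsum + eps)
```
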
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures, accepted by ACM-MM 2024 (poster)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12051">arXiv:2407.12051</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.12051">pdf</a>, <a href="https://arxiv.org/format/2407.12051">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Genomics">q-bio.GN</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dy-mer: An Explainable DNA Sequence Representation Scheme using Sparse Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yuanbo Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12051v1-abstract-short" style="display: inline;"> DNA sequences encode vital genetic and biological information, yet these unfixed-length sequences cannot serve as the input of common data mining algorithms. Hence, various representation schemes have been developed to transform DNA sequences into fixed-length numerical representations. However, these schemes face difficulties in learning high-quality representations due to the complexity and spar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12051v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12051v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12051v1-abstract-full" style="display: none;"> DNA sequences encode vital genetic and biological information, yet these unfixed-length sequences cannot serve as the input of common data mining algorithms. Hence, various representation schemes have been developed to transform DNA sequences into fixed-length numerical representations. However, these schemes face difficulties in learning high-quality representations due to the complexity and sparsity of DNA data. Additionally, DNA sequences are inherently noisy because of mutations. While several schemes have been proposed for their effectiveness, they often lack semantic structure, making it difficult for biologists to validate and leverage the results. To address these challenges, we propose \textbf{Dy-mer}, an explainable and robust DNA representation scheme based on sparse recovery. Leveraging the underlying semantic structure of DNA, we modify the traditional sparse recovery to capture recurring patterns indicative of biological functions by representing frequent K-mers as basis vectors and reconstructing each DNA sequence through simple concatenation. Experimental results demonstrate that \textbf{Dy-mer} achieves state-of-the-art performance in DNA promoter classification, yielding a remarkable \textbf{13\%} increase in accuracy. 
arXiv:2407.05726 [pdf, other] cs.CV, eess.IV
Title: Gait Patterns as Biomarkers: A Video-Based Approach for Classifying Scoliosis
Authors: Zirui Zhou, Junhao Liang, Zizhao Peng, Chao Fan, Fengwei An, Shiqi Yu
Abstract: Scoliosis presents significant diagnostic challenges, particularly in adolescents, for whom early detection is crucial for effective treatment. Traditional diagnostic and follow-up methods, which rely on physical examinations and radiography, are limited by the need for clinical expertise and the risk of radiation exposure, restricting their use for widespread early screening. In response, we introduce a novel video-based, non-invasive method for scoliosis classification using gait analysis, which circumvents these limitations. This study presents Scoliosis1K, the first large-scale dataset specifically designed for video-based scoliosis classification, encompassing over one thousand adolescents. Leveraging this dataset, we developed ScoNet, an initial model that faced challenges in handling the complexities of real-world data, which led to ScoNet-MT, an enhanced model incorporating multi-task learning that demonstrates promising diagnostic accuracy for practical applications. Our findings demonstrate that gait can serve as a non-invasive biomarker for scoliosis, revolutionizing screening practices through deep learning and setting a precedent for non-invasive diagnostic methodologies. The dataset and code are publicly available at https://zhouzi180.github.io/Scoliosis1K/.
Submitted 23 August, 2024; v1 submitted 8 July, 2024; originally announced July 2024.
Comments: Accepted to MICCAI 2024

Leveraging this dataset, we developed ScoNet, an initial model that faced challenges in handling the complexities of real-world data. This led to the development of ScoNet-MT, an enhanced model incorporating multi-task learning, which demonstrates promising diagnostic accuracy for practical applications. Our findings demonstrate that gait can serve as a non-invasive biomarker for scoliosis, revolutionizing screening practices through deep learning and setting a precedent for non-invasive diagnostic methodologies. The dataset and code are publicly available at https://zhouzi180.github.io/Scoliosis1K/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05726v3-abstract-full').style.display = 'none'; document.getElementById('2407.05726v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to MICCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00286">arXiv:2407.00286</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00286">pdf</a>, <a href="https://arxiv.org/format/2407.00286">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Digital Twin-Assisted Data-Driven Optimization for Reliable Edge Caching in Wireless Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zifan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuchen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhiyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mingzhe Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+D">Dongkuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+S">Shuguang Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00286v1-abstract-short" style="display: inline;"> Optimizing edge caching is crucial for the advancement of next-generation (nextG) wireless networks, ensuring high-speed and low-latency services for mobile users. 
Existing data-driven optimization approaches often lack awareness of the distribution of random data variables and focus solely on optimizing cache hit rates, neglecting potential reliability concerns such as base station overload and unbalanced cache issues. This oversight can result in system crashes and degraded user experience. To bridge this gap, we introduce a novel digital twin-assisted optimization framework, called D-REC, which integrates reinforcement learning (RL) with diverse intervention modules to ensure reliable caching in nextG wireless networks. We first develop a joint vertical and horizontal twinning approach to efficiently create network digital twins, which are then employed by D-REC as RL optimizers and safeguards, providing ample datasets for training and predictive evaluation of our cache replacement policy. By incorporating reliability modules into a constrained Markov decision process, D-REC can adaptively adjust actions, rewards, and states to comply with advantageous constraints, minimizing the risk of network failures. Theoretical analysis demonstrates comparable convergence rates between D-REC and vanilla data-driven methods without compromising caching performance. Extensive experiments validate that D-REC outperforms conventional approaches in cache hit rate and load balancing while effectively enforcing predetermined reliability intervention modules.
Submitted 28 June, 2024; originally announced July 2024.
Comments: Accepted by IEEE Journal on Selected Areas in Communications (JSAC)
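The constrained-MDP ingredient above lends itself to a small illustration. The sketch below shows one way a reliability module could shape a cache-replacement reward when a digital-twin load forecast predicts a violation; the names (LOAD_LIMIT, reliable_reward) and the toy tabular Q-update are assumptions of this sketch, not the authors' implementation.

```python
# Illustrative reward shaping for a constrained cache-replacement MDP.
# All thresholds and names are assumptions of this sketch, not D-REC's code.
from collections import defaultdict

LOAD_LIMIT = 0.9   # assumed reliability constraint on twin-predicted load
PENALTY = 5.0      # assumed penalty applied when the constraint is violated

Q = defaultdict(float)  # tabular Q-values for a toy cache-replacement MDP

def reliable_reward(hit, predicted_load):
    """Cache-hit reward, adjusted by a reliability intervention module."""
    r = 1.0 if hit else 0.0
    if predicted_load > LOAD_LIMIT:  # digital twin forecasts an overload
        r -= PENALTY                 # steer the policy away from violations
    return r

def q_update(s, a, r, s_next, actions, alpha=0.1, gamma=0.9):
    """One tabular Q-learning step on the shaped reward."""
    best_next = max(Q[(s_next, b)] for b in actions)
    Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])

# Toy usage: the chosen eviction produced a hit, but the twin predicts an
# overload, so the shaped reward is negative and the policy is pushed away.
r = reliable_reward(hit=True, predicted_load=0.95)
q_update("cache_state_1", "evict_b", r, "cache_state_2", ["evict_a", "evict_b"])
```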
arXiv:2406.19195 [cs.LG, cs.AI]
Estimating Long-term Heterogeneous Dose-response Curve: Generalization Bound Leveraging Optimal Transport Weights
Authors: Zeqin Yang, Weilin Chen, Ruichu Cai, Yuguang Yan, Zhifeng Hao, Zhipeng Yu, Zhichao Zou, Zhen Peng, Jiecheng Guo
Abstract: Long-term causal effect estimation is a significant but challenging problem in many applications. Existing methods rely on ideal assumptions, e.g., no unobserved confounders or a binary treatment, to estimate long-term average effects; in numerous real-world applications these assumptions can be violated, and average effects cannot provide individual-level suggestions. In this paper, we address a more general problem: estimating the long-term heterogeneous dose-response curve (HDRC) while accounting for unobserved confounders. Specifically, to remove unobserved confounding in observational data, we introduce an optimal transport weighting framework to align the observational data to the experimental data with theoretical guarantees. Furthermore, to accurately predict the heterogeneous effects of continuous treatment, we establish a generalization bound on counterfactual prediction error by leveraging the reweighted distribution induced by optimal transport. Finally, we develop an HDRC estimator building upon the above theoretical foundations. Extensive experimental studies conducted on multiple synthetic and semi-synthetic datasets demonstrate the effectiveness of our proposed method.
Submitted 27 June, 2024; originally announced June 2024.
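For readers unfamiliar with optimal transport alignment, the following minimal sketch uses the POT library to compute a transport plan between observational and experimental covariates and a barycentric projection that aligns the former with the latter. The toy data and the projection step are assumptions of this sketch; the paper's exact weighting objective may differ.

```python
# Minimal optimal-transport alignment of observational to experimental
# covariates, via the POT library; toy data, illustrative only.
import numpy as np
import ot  # POT: Python Optimal Transport

rng = np.random.default_rng(0)
X_obs = rng.normal(0.0, 1.0, size=(200, 5))  # observational covariates (toy)
X_exp = rng.normal(0.5, 1.0, size=(100, 5))  # experimental covariates (toy)

a = np.full(len(X_obs), 1.0 / len(X_obs))    # uniform mass on obs samples
b = np.full(len(X_exp), 1.0 / len(X_exp))    # uniform mass on exp samples
M = ot.dist(X_obs, X_exp)                    # squared-Euclidean cost matrix

G = ot.emd(a, b, M)                          # exact OT plan (coupling matrix)

# Barycentric projection: where each observational point's mass lands in the
# experimental sample; one standard way to "align" the two datasets.
X_aligned = (G / G.sum(axis=1, keepdims=True)) @ X_exp
```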
arXiv:2406.16907 [eess.SP, cs.LG]
RayProNet: A Neural Point Field Framework for Radio Propagation Modeling in 3D Environments
Authors: Ge Cao, Zhen Peng
Abstract: The radio wave propagation channel is central to the performance of wireless communication systems. In this paper, we introduce a novel machine learning-empowered methodology for wireless channel modeling. The key ingredients include a point-cloud-based neural network and a spherical harmonics encoder with light probes. Our approach offers several significant advantages, including the flexibility to adjust antenna radiation patterns and transmitter/receiver locations, the capability to predict radio power maps, and scalability to large-scale wireless scenes. As a result, it lays the groundwork for an end-to-end pipeline for network planning and deployment optimization. The proposed work is validated in various outdoor and indoor radio environments.
Submitted 3 June, 2024; originally announced June 2024.
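One plausible reading of a spherical-harmonics encoder is a fixed featurization of directions on the sphere (e.g., for antenna radiation patterns). The sketch below is my own construction using scipy's sph_harm; the real-basis mapping via real/imaginary parts is a common convention, not necessarily the paper's.

```python
# Encoding a 3D unit direction as real spherical-harmonic features; the
# real-basis convention (real part for m >= 0, imaginary part for m < 0)
# is a common choice, not necessarily the paper's.
import numpy as np
from scipy.special import sph_harm

def sh_encode(direction, max_degree=3):
    x, y, z = direction / np.linalg.norm(direction)
    theta = np.arctan2(y, x) % (2 * np.pi)   # azimuth in [0, 2*pi)
    phi = np.arccos(np.clip(z, -1.0, 1.0))   # polar angle in [0, pi]
    feats = []
    for n in range(max_degree + 1):          # degree
        for m in range(-n, n + 1):           # order
            Y = sph_harm(m, n, theta, phi)   # complex spherical harmonic
            feats.append(Y.real if m >= 0 else Y.imag)
    return np.array(feats)                   # (max_degree + 1)**2 features

print(sh_encode(np.array([0.0, 0.0, 1.0])).shape)  # (16,)
```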
arXiv:2406.16866 [cs.CV]
Revisiting Referring Expression Comprehension Evaluation in the Era of Large Multimodal Models
Authors: Jierun Chen, Fangyun Wei, Jinjing Zhao, Sizhe Song, Bohuai Wu, Zhuoxuan Peng, S.-H. Gary Chan, Hongyang Zhang
Abstract: Referring expression comprehension (REC) involves localizing a target instance based on a textual description. Recent advancements in REC have been driven by large multimodal models (LMMs) like CogVLM, which achieved 92.44% accuracy on RefCOCO. However, this study questions whether existing benchmarks such as RefCOCO, RefCOCO+, and RefCOCOg capture LMMs' comprehensive capabilities. We begin with a manual examination of these benchmarks, revealing high labeling error rates: 14% in RefCOCO, 24% in RefCOCO+, and 5% in RefCOCOg, which undermines the authenticity of evaluations.
We address this by excluding problematic instances and reevaluating several LMMs capable of handling the REC task, showing significant accuracy improvements and thus highlighting the impact of benchmark noise. In response, we introduce Ref-L4, a comprehensive REC benchmark specifically designed to evaluate modern REC models. Ref-L4 is distinguished by four key features: 1) a substantial sample size with 45,341 annotations; 2) a diverse range of object categories with 365 distinct types and varying instance scales from 30 to 3,767; 3) lengthy referring expressions averaging 24.2 words; and 4) an extensive vocabulary comprising 22,813 unique words. We evaluate a total of 24 large models on Ref-L4 and provide valuable insights. The cleaned versions of RefCOCO, RefCOCO+, and RefCOCOg, as well as our Ref-L4 benchmark and evaluation code, are available at https://github.com/JierunChen/Ref-L4.
Submitted 24 June, 2024; originally announced June 2024.

arXiv:2406.16557 [cs.LG, cs.CY]
Efficient k-means with Individual Fairness via Exponential Tilting
Authors: Shengkun Zhu, Jinshan Zeng, Yuan Sun, Sheng Wang, Xiaodong Li, Zhiyong Peng
Abstract: In location-based resource allocation scenarios, the distances between each individual and the facility should be approximately equal, thereby ensuring fairness. Individually fair clustering is often employed to achieve the principle of treating all points equally, which can be applied in these scenarios.
This paper proposes a novel algorithm, tilted k-means (TKM), aiming to achieve individual fairness in clustering. We integrate exponential tilting into the sum of squared errors (SSE) to formulate a novel objective function called the tilted SSE. We demonstrate that the tilted SSE generalizes the SSE, and we employ coordinate descent and a first-order gradient method for optimization. We propose a novel fairness metric, the variance of the distances within each cluster, which can alleviate the Matthew effect typically caused by existing fairness metrics. Our theoretical analysis demonstrates that the well-known k-means++ incurs a multiplicative error of O(k log k), and we establish the convergence of TKM under mild conditions. In terms of fairness, we prove that the variance generated by TKM decreases with a scaled hyperparameter. In terms of efficiency, we demonstrate that the time complexity is linear in the dataset size. Our experiments demonstrate that TKM outperforms state-of-the-art methods in effectiveness, fairness, and efficiency.
Submitted 24 June, 2024; originally announced June 2024.
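The tilted-SSE construction is concrete enough to sketch: exponentially tilt the per-point squared distances so that, as the tilt t grows, far-away points dominate the objective. Below is a minimal sketch under that reading, with plain gradient descent standing in for the paper's coordinate-descent/first-order scheme; hyperparameters are illustrative.

```python
# Tilted SSE: (1/t) * log(mean(exp(t * d_i^2))), where d_i^2 is the squared
# distance of point i to its nearest center; t -> 0 recovers the mean SSE.
import numpy as np
from scipy.special import logsumexp

def tilted_sse(X, centers, t=1.0):
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1).min(axis=1)
    return (logsumexp(t * d2) - np.log(len(X))) / t

def tkm_step(X, centers, t=1.0, lr=0.1):
    """One gradient step on the tilted SSE (nearest-center assignment fixed)."""
    d2_all = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
    assign = d2_all.argmin(axis=1)
    d2 = d2_all[np.arange(len(X)), assign]
    w = np.exp(t * d2 - logsumexp(t * d2))   # tilted (softmax) point weights
    for k in range(len(centers)):
        mask = assign == k
        if mask.any():                       # grad of sum_i w_i * d_i^2 wrt c_k
            centers[k] -= lr * (w[mask, None] * 2 * (centers[k] - X[mask])).sum(0)
    return centers

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
centers = X[rng.choice(len(X), 3, replace=False)].copy()
for _ in range(200):
    centers = tkm_step(X, centers, t=0.5)
```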
arXiv:2406.14880 [cs.LG, cs.LO]
Pathformer: Recursive Path Query Encoding for Complex Logical Query Answering
Authors: Chongzhi Zhang, Zhiping Peng, Junhao Zheng, Linghao Wang, Ruifeng Shi, Qianli Ma
Abstract: Complex Logical Query Answering (CLQA) over incomplete knowledge graphs is a challenging task. Recently, Query Embedding (QE) methods have been proposed to solve CLQA by performing multi-hop logical reasoning. However, most of them consider only historical query context information while ignoring future information, which leads to their failure to capture the complex dependencies behind the elements of a query. In recent years, the transformer architecture has shown a strong ability to model long-range dependencies between words. The bidirectional attention mechanism proposed by the transformer can address the limitation of these QE methods regarding query context. Still, as a sequence model, it is difficult for the transformer to directly model complex logical queries whose computation graphs have branch structures. To this end, we propose a neural one-point embedding method called Pathformer based on the tree-like computation graph, i.e., the query computation tree. Specifically, Pathformer decomposes the query computation tree into path query sequences by branches and then uses the transformer encoder to recursively encode these path query sequences to obtain the final query embedding. This allows Pathformer to fully utilize future context information to explicitly model the complex interactions between various parts of a path query. Experimental results show that Pathformer outperforms existing competitive neural QE methods, and we find that Pathformer has the potential to be applied to non-one-point embedding spaces.
Submitted 21 June, 2024; originally announced June 2024.
Comments: This work has been submitted to the IEEE
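The branch-decomposition step can be illustrated directly. The sketch below splits a toy query computation tree into root-to-leaf path sequences, each of which would then be fed to the transformer encoder; the tree encoding and token ordering are assumptions of this sketch.

```python
# Splitting a toy query computation tree into root-to-leaf path sequences;
# the tree encoding and token ordering here are illustrative assumptions.
def decompose(node, prefix=()):
    """Return all root-to-leaf paths of a query computation tree.

    A branch node is (op, children); a leaf is an anchor-entity token.
    """
    if not isinstance(node, tuple):
        return [prefix + (node,)]
    op, children = node
    paths = []
    for child in children:                # each branch yields its own path
        paths.extend(decompose(child, prefix + (op,)))
    return paths

# Intersection ("AND") of two one-hop projection chains:
tree = ("AND", [("friend_of", ["Alice"]), ("works_at", ["AcmeCorp"])])
print(decompose(tree))
# [('AND', 'friend_of', 'Alice'), ('AND', 'works_at', 'AcmeCorp')]
```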
arXiv:2406.09386 [cs.CV]
SimGen: Simulator-conditioned Driving Scene Generation
Authors: Yunsong Zhou, Michael Simon, Zhenghao Peng, Sicheng Mo, Hongzi Zhu, Minyi Guo, Bolei Zhou
Abstract: Controllable synthetic data generation can substantially lower the annotation cost of training data. Prior works use diffusion models to generate driving images conditioned on the 3D object layout. However, those models are trained on small-scale datasets like nuScenes, which lack appearance and layout diversity. Moreover, overfitting often happens, where the trained models can only generate images based on the layout data from the validation set of the same dataset.
In this work, we introduce a simulator-conditioned scene generation framework called SimGen that learns to generate diverse driving scenes by mixing data from the simulator and the real world. It uses a novel cascade diffusion pipeline to address challenging sim-to-real gaps and multi-condition conflicts. A driving video dataset, DIVA, is collected to enhance the generative diversity of SimGen; it contains over 147.5 hours of real-world driving videos from 73 locations worldwide and simulated driving data from the MetaDrive simulator. SimGen achieves superior generation quality and diversity while preserving controllability based on the text prompt and the layout pulled from a simulator. We further demonstrate the improvements SimGen brings to synthetic data augmentation on BEV detection and segmentation tasks and showcase its capability in safety-critical data generation.
Submitted 28 October, 2024; v1 submitted 13 June, 2024; originally announced June 2024.

arXiv:2406.08756 [cs.DC, cs.LG]
Optimizing Large Model Training through Overlapped Activation Recomputation
Authors: Ping Chen, Wenjie Zhang, Shuibing He, Yingjie Gu, Zhuwei Peng, Kexin Huang, Xuan Zhan, Weijian Chen, Yi Zheng, Zhefeng Wang, Yanlong Yin, Gang Chen
Abstract: Large model training has been using recomputation to alleviate memory pressure and pipelining to exploit parallelism across data, tensors, and devices.
Existing recomputation approaches may incur up to 40% overhead when training real-world models, e.g., the GPT model with 22B parameters, because they are executed on demand in the critical training path. In this paper, we design a new recomputation framework, Lynx, that reduces this overhead by overlapping recomputation with the communication occurring in training pipelines. It consists of an optimal scheduling algorithm (OPT) and a heuristic-based scheduling algorithm (HEU). OPT achieves a global optimum but suffers from a long search time. HEU is based on our observation that large DNN models contain identical structures, so the same scheduling policy can be applied to all of them. HEU achieves a local optimum but reduces the search time by 99% compared to OPT. Our comprehensive evaluation using GPT models with 1.3B-20B parameters shows that both OPT and HEU outperform state-of-the-art recomputation approaches (e.g., Megatron-LM and Checkmate) by 1.02-1.53x. HEU achieves performance similar to OPT with an average search time of 0.16s.
Submitted 27 June, 2024; v1 submitted 12 June, 2024; originally announced June 2024.
Comments: 13 pages
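A back-of-envelope illustration (toy numbers of my own, not the paper's algorithm or measurements) of why overlapping helps: on-demand recomputation adds its full cost to the critical path, whereas an overlapped schedule pays only for whatever does not fit inside each stage's communication wait.

```python
# Toy numbers, illustrative only: on-demand recomputation adds its full cost
# to the critical path; an overlapped schedule pays only what exceeds each
# stage's pipeline-communication idle time.
recompute = [12, 9, 15, 10]   # per-microbatch recomputation time (ms)
comm_wait = [10, 14, 8, 20]   # per-microbatch communication idle time (ms)

on_demand = sum(recompute)
overlapped = sum(max(0, r - c) for r, c in zip(recompute, comm_wait))
print(on_demand, overlapped)  # 46 vs 9: most recomputation is hidden
```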
arXiv:2406.07539 [cs.RO]
BAKU: An Efficient Transformer for Multi-Task Policy Learning
Authors: Siddhant Haldar, Zhuoran Peng, Lerrel Pinto
Abstract: Training generalist agents capable of solving diverse tasks is challenging, often requiring large datasets of expert demonstrations. This is particularly problematic in robotics, where each data point requires physical execution of actions in the real world. Thus, there is a pressing need for architectures that can effectively leverage the available training data. In this work, we present BAKU, a simple transformer architecture that enables efficient learning of multi-task robot policies. BAKU builds upon recent advancements in offline imitation learning and meticulously combines observation trunks, action chunking, multi-sensory observations, and action heads to substantially improve upon prior work. Our experiments on 129 simulated tasks across LIBERO, the Meta-World suite, and the DeepMind Control suite exhibit an overall 18% absolute improvement over RT-1 and MT-ACT, with a 36% improvement on the harder LIBERO benchmark. On 30 real-world manipulation tasks, given an average of just 17 demonstrations per task, BAKU achieves a 91% success rate. Videos of the robot are best viewed at https://baku-robot.github.io/.
Submitted 16 July, 2024; v1 submitted 11 June, 2024; originally announced June 2024.
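As a rough illustration of the listed ingredients, the sketch below wires multi-sensory observation tokens and a task token through a small transformer trunk into an action head that predicts a chunk of actions. All dimensions and the readout choice are assumptions of this sketch, not BAKU's exact design.

```python
# A toy multi-task policy: multi-sensory observation tokens, a task token,
# a shared transformer trunk, and an action head predicting an action chunk.
import torch
import torch.nn as nn

class TinyMultiTaskPolicy(nn.Module):
    def __init__(self, obs_dim=64, act_dim=7, chunk=10, d_model=128, n_tasks=129):
        super().__init__()
        self.obs_proj = nn.Linear(obs_dim, d_model)     # one token per sensor
        self.task_emb = nn.Embedding(n_tasks, d_model)  # task-conditioning token
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.trunk = nn.TransformerEncoder(layer, num_layers=2)
        self.action_head = nn.Linear(d_model, act_dim * chunk)  # action chunking
        self.act_dim, self.chunk = act_dim, chunk

    def forward(self, obs_tokens, task_id):
        # obs_tokens: (B, n_sensors, obs_dim); task_id: (B,)
        tokens = torch.cat(
            [self.task_emb(task_id)[:, None, :], self.obs_proj(obs_tokens)], dim=1)
        h = self.trunk(tokens)[:, 0]        # read out at the task token
        return self.action_head(h).view(-1, self.chunk, self.act_dim)

policy = TinyMultiTaskPolicy()
actions = policy(torch.randn(2, 3, 64), torch.tensor([5, 42]))  # (2, 10, 7)
```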
arXiv:2406.04035 [cs.LG, cs.AI]
STEMO: Early Spatio-temporal Forecasting with Multi-Objective Reinforcement Learning
Authors: Wei Shao, Yufan Kang, Ziyan Peng, Xiao Xiao, Lei Wang, Yuhui Yang, Flora D Salim
Abstract: Accuracy and timeliness are often conflicting goals in prediction tasks. Premature predictions may yield a higher rate of false alarms, whereas delaying predictions to gather more information can render them too late to be useful. In applications such as wildfires, crimes, and traffic jams, timely forecasting is vital for safeguarding human life and property. Consequently, finding a balance between accuracy and timeliness is crucial. In this paper, we propose an early spatio-temporal forecasting model based on multi-objective reinforcement learning that can either implement an optimal policy given a preference or infer the preference from a small number of samples.
The model addresses two primary challenges: 1) enhancing the accuracy of early forecasting and 2) providing the optimal policy for determining the most suitable prediction time for each area. Our method demonstrates superior performance on three large-scale real-world datasets, surpassing existing methods in early spatio-temporal forecasting tasks.
Submitted 18 June, 2024; v1 submitted 6 June, 2024; originally announced June 2024.
Comments: Accepted paper in KDD 2024
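A common way to make one policy family serve many preferences, consistent with the abstract's description, is linear scalarization of the objective vector. The sketch below collapses an (accuracy, earliness) reward with a preference vector w; the reward definitions and numbers are illustrative, not the paper's.

```python
# Linear scalarization of an (accuracy, earliness) reward with a preference
# vector w; reward definitions and numbers are illustrative.
import numpy as np

def scalarized_reward(correct, steps_early, w):
    r = np.array([1.0 if correct else -1.0, float(steps_early)])
    return float(w @ r)

w_accuracy_first = np.array([0.9, 0.1])
w_timeliness_first = np.array([0.2, 0.8])
print(scalarized_reward(True, 3, w_accuracy_first))    # 1.2
print(scalarized_reward(True, 3, w_timeliness_first))  # 2.6
```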
arXiv:2405.20654 [cs.CL, cs.IR]
Passage-specific Prompt Tuning for Passage Reranking in Question Answering with Large Language Models
Authors: Xuyang Wu, Zhiyuan Peng, Krishna Sravanthi Rajanala Sai, Hsin-Tai Wu, Yi Fang
Abstract: Effective passage retrieval and reranking methods have been widely utilized to identify suitable candidates in open-domain question answering tasks, and recent studies have resorted to LLMs for reranking the retrieved passages by the log-likelihood of the question conditioned on each passage. Although these methods have demonstrated promising results, the performance is notably sensitive to the human-written prompt (or hard prompt), and fine-tuning LLMs can be computationally intensive and time-consuming. Furthermore, this approach limits the use of question-passage relevance pairs and passage-specific knowledge to enhance the ranking capabilities of LLMs. In this paper, we propose passage-specific prompt tuning for reranking in open-domain question answering (PSPT): a parameter-efficient method that fine-tunes learnable passage-specific soft prompts, incorporating passage-specific knowledge from a limited set of question-passage relevance pairs. The method ranks retrieved passages based on the log-likelihood of the model generating the question conditioned on each passage and the learned soft prompt. We conducted extensive experiments utilizing the Llama-2-chat-7B model across three publicly available open-domain question answering datasets, and the results demonstrate the effectiveness of the proposed approach.
Submitted 20 June, 2024; v1 submitted 31 May, 2024; originally announced May 2024.
Comments: Accepted at Gen-IR@SIGIR24
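The scoring rule itself is easy to sketch: rank passages by the log-likelihood of the model generating the question conditioned on each passage. The snippet below uses gpt2 as a small stand-in for the Llama-2-chat-7B model named in the abstract, and a hard prompt template in place of PSPT's learned soft prompt; the template and helper names are assumptions of this sketch.

```python
# Rank passages by log P(question | passage) under a causal LM. gpt2 stands in
# for Llama-2-chat-7B; the template replaces PSPT's learned soft prompt.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def question_loglik(passage, question):
    prefix = f"Passage: {passage}\nQuestion:"
    ids = tok(prefix + " " + question, return_tensors="pt").input_ids
    # Assumes the prefix's tokenization is a prefix of the full tokenization,
    # which holds for this template with the GPT-2 tokenizer.
    n_prefix = tok(prefix, return_tensors="pt").input_ids.shape[1]
    with torch.no_grad():
        logits = model(ids).logits
    logp = torch.log_softmax(logits[0, :-1], dim=-1)   # predicts ids[0, 1:]
    targets = ids[0, 1:]
    token_lp = logp[torch.arange(len(targets)), targets]
    return token_lp[n_prefix - 1:].sum().item()        # question tokens only

passages = ["Paris is the capital of France.", "Berlin is in Germany."]
q = "What is the capital of France?"
ranked = sorted(passages, key=lambda p: question_loglik(p, q), reverse=True)
```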