Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 240 results for author: <span class="mathjax">Du, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Du%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Du, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Du%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Du, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Du%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Du%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16726">arXiv:2410.16726</a> <span> [<a href="https://arxiv.org/pdf/2410.16726">pdf</a>, <a href="https://arxiv.org/format/2410.16726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Low-Resource ASR through Versatile TTS: Bridging the Data Gap </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanrou Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fan Yu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16726v1-abstract-short" style="display: inline;"> While automatic speech recognition (ASR) systems have achieved remarkable performance with large-scale datasets, their efficacy remains inadequate in low-resource settings, encompassing dialects, accents, minority languages, and long-tail hotwords, domains with significant practical relevance. 
With the advent of versatile and powerful text-to-speech (TTS) models, capable of generating speech with… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16726v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16726v1-abstract-full" style="display: none;"> While automatic speech recognition (ASR) systems have achieved remarkable performance with large-scale datasets, their efficacy remains inadequate in low-resource settings, encompassing dialects, accents, minority languages, and long-tail hotwords, domains with significant practical relevance. With the advent of versatile and powerful text-to-speech (TTS) models, capable of generating speech with human-level naturalness, expressiveness, and diverse speaker profiles, leveraging TTS for ASR data augmentation provides a cost-effective and practical approach to enhancing ASR performance. Comprehensive experiments on an unprecedentedly rich variety of low-resource datasets demonstrate consistent and substantial performance improvements, proving that the proposed method of enhancing low-resource ASR through a versatile TTS model is highly effective and has broad application prospects. Furthermore, we delve deeper into key characteristics of synthesized speech data that contribute to ASR improvement, examining factors such as text diversity, speaker diversity, and the volume of synthesized data, with text diversity being studied for the first time in this work. We hope our findings provide helpful guidance and reference for the practical application of TTS-based data augmentation and push the advancement of low-resource ASR one step further. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16726v1-abstract-full').style.display = 'none'; document.getElementById('2410.16726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
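
As a rough illustration of the TTS-based data augmentation idea summarized in the entry above, the sketch below synthesizes speech for sampled texts and speakers and appends the resulting pairs to the real training set. The `tts` object and its `synthesize` method are hypothetical placeholders, not the paper's actual pipeline or API.

```python
import random

def augment_with_tts(tts, texts, speakers, real_pairs, n_synthetic):
    """Return real plus synthetic (audio, transcript) pairs for ASR training."""
    synthetic = []
    for _ in range(n_synthetic):
        text = random.choice(texts)        # text diversity
        speaker = random.choice(speakers)  # speaker diversity
        audio = tts.synthesize(text, speaker=speaker)  # hypothetical TTS call
        synthetic.append((audio, text))
    return real_pairs + synthetic
```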
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14152">arXiv:2410.14152</a> <span> [<a href="https://arxiv.org/pdf/2410.14152">pdf</a>, <a href="https://arxiv.org/format/2410.14152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SRAP-Agent: Simulating and Optimizing Scarce Resource Allocation Policy with LLM-based Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jiarui Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongtao Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhicheng Du</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhewei Wei</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+W">Weiran Shen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Q">Qi Qi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yankai Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14152v1-abstract-short" style="display: inline;"> Public scarce resource allocation plays a crucial role in economics as it directly influences the efficiency and equity in society. Traditional studies including theoretical model-based, empirical study-based and simulation-based methods encounter limitations due to the idealized assumption of complete information and individual rationality, as well as constraints posed by limited available data.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14152v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14152v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14152v1-abstract-full" style="display: none;"> Public scarce resource allocation plays a crucial role in economics as it directly influences the efficiency and equity in society. Traditional studies including theoretical model-based, empirical study-based and simulation-based methods encounter limitations due to the idealized assumption of complete information and individual rationality, as well as constraints posed by limited available data. In this work, we propose an innovative framework, SRAP-Agent (Simulating and Optimizing Scarce Resource Allocation Policy with LLM-based Agent), which integrates Large Language Models (LLMs) into economic simulations, aiming to bridge the gap between theoretical models and real-world dynamics. Using public housing allocation scenarios as a case study, we conduct extensive policy simulation experiments to verify the feasibility and effectiveness of the SRAP-Agent and employ the Policy Optimization Algorithm with certain optimization objectives. 
The source code can be found in https://github.com/jijiarui-cather/SRAPAgent_Framework <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14152v1-abstract-full').style.display = 'none'; document.getElementById('2410.14152v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12376">arXiv:2410.12376</a> <span> [<a href="https://arxiv.org/pdf/2410.12376">pdf</a>, <a href="https://arxiv.org/format/2410.12376">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ShapefileGPT: A Multi-Agent Large Language Model Framework for Automated Shapefile Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qingming Lin</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rui Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huaxia Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Sensen Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yadong Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+K">Kai Fang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Hailin Feng</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhenhong Du</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Liuchang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12376v2-abstract-short" style="display: inline;"> Vector data is one of the two core data structures in geographic information science (GIS), essential for accurately storing and representing geospatial information. Shapefile, the most widely used vector data format, has become the industry standard supported by all major geographic information systems. However, processing this data typically requires specialized GIS knowledge and skills, creatin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12376v2-abstract-full').style.display = 'inline'; document.getElementById('2410.12376v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12376v2-abstract-full" style="display: none;"> Vector data is one of the two core data structures in geographic information science (GIS), essential for accurately storing and representing geospatial information. Shapefile, the most widely used vector data format, has become the industry standard supported by all major geographic information systems. However, processing this data typically requires specialized GIS knowledge and skills, creating a barrier for researchers from other fields and impeding interdisciplinary research in spatial data analysis. Moreover, while large language models (LLMs) have made significant advancements in natural language processing and task automation, they still face challenges in handling the complex spatial and topological relationships inherent in GIS vector data. 
To address these challenges, we propose ShapefileGPT, an innovative framework powered by LLMs, specifically designed to automate Shapefile tasks. ShapefileGPT utilizes a multi-agent architecture, in which the planner agent is responsible for task decomposition and supervision, while the worker agent executes the tasks. We developed a specialized function library for handling Shapefiles and provided comprehensive API documentation, enabling the worker agent to operate Shapefiles efficiently through function calling. For evaluation, we developed a benchmark dataset based on authoritative textbooks, encompassing tasks in categories such as geometric operations and spatial queries. ShapefileGPT achieved a task success rate of 95.24%, outperforming the GPT series models. In comparison to traditional LLMs, ShapefileGPT effectively handles complex vector data analysis tasks, overcoming the limitations of traditional LLMs in spatial analysis. This breakthrough opens new pathways for advancing automation and intelligence in the GIS field, with significant potential in interdisciplinary data analysis and application contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12376v2-abstract-full').style.display = 'none'; document.getElementById('2410.12376v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10865">arXiv:2410.10865</a> <span> [<a href="https://arxiv.org/pdf/2410.10865">pdf</a>, <a href="https://arxiv.org/format/2410.10865">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generating Synthetic Datasets for Few-shot Prompt Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xu Guo</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zilin Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyang Li</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+C">Chunyan Miao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10865v1-abstract-short" style="display: inline;"> A major limitation of prompt tuning is its dependence on large labeled training datasets. Under few-shot learning settings, prompt tuning lags far behind full-model fine-tuning, limiting its scope of application. In this paper, we leverage the powerful LLMs to synthesize task-specific labeled data for training the soft prompts. 
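
To make the planner/worker function-calling setup described in the ShapefileGPT abstract more concrete, here is a minimal sketch of the worker-agent side: a tiny Shapefile function library (using geopandas purely for illustration) and a dispatcher that executes a call chosen by an LLM. The function names and the call format are assumptions for illustration, not ShapefileGPT's actual library or API.

```python
import geopandas as gpd

def count_features(path: str) -> int:
    """Number of features in a shapefile layer."""
    return len(gpd.read_file(path))

def buffer_layer(path: str, distance: float, out_path: str) -> str:
    """Buffer every geometry by `distance` (in layer units) and save the result."""
    gdf = gpd.read_file(path)
    gdf["geometry"] = gdf.geometry.buffer(distance)
    gdf.to_file(out_path)
    return out_path

# The "function library" the worker agent is allowed to call.
FUNCTION_LIBRARY = {"count_features": count_features, "buffer_layer": buffer_layer}

def execute_tool_call(call: dict):
    # `call` mimics an LLM-produced function call: {"name": ..., "arguments": {...}}
    return FUNCTION_LIBRARY[call["name"]](**call["arguments"])
```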

4. arXiv:2410.10865 [pdf, other]  cs.CL cs.AI
   Generating Synthetic Datasets for Few-shot Prompt Tuning
   Authors: Xu Guo, Zilin Du, Boyang Li, Chunyan Miao
   Abstract: A major limitation of prompt tuning is its dependence on large labeled training datasets. Under few-shot learning settings, prompt tuning lags far behind full-model fine-tuning, limiting its scope of application. In this paper, we leverage powerful LLMs to synthesize task-specific labeled data for training the soft prompts. We first introduce a distribution-aligned weighted generator tuning (DawGen) method to encourage generating in-distribution data that aligns with the few-shot real data. Then, we train soft prompts on both synthetic and real datasets using a gradient surgery approach, which eliminates the conflicting gradients from different data sources. Experiments on seven sentence-pair classification datasets demonstrate the effectiveness of our proposed method for boosting prompt tuning in few-shot learning settings. Results on the QQP, MRPC, and SICK datasets are even comparable to the performance of transfer learning from large real-world datasets, showing the promise of synthetic data as an alternative for enhancing soft prompt tuning.
   Submitted 7 October, 2024; originally announced October 2024.
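
Gradient surgery, as mentioned in the abstract above, is commonly implemented as a PCGrad-style projection that removes the component of one gradient that conflicts with another. The sketch below shows that generic rule for two gradient sources (real and synthetic data); it illustrates the general technique only and is not necessarily the exact procedure used in the paper.

```python
import torch

def combine_gradients(g_real: torch.Tensor, g_syn: torch.Tensor) -> torch.Tensor:
    """Combine real-data and synthetic-data gradients for the soft prompt."""
    dot = torch.dot(g_syn.flatten(), g_real.flatten())
    if dot < 0:  # gradients conflict: project out the conflicting component
        g_syn = g_syn - (dot / g_real.flatten().norm() ** 2) * g_real
    return g_real + g_syn
```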

5. arXiv:2410.08035 [pdf, other]  cs.SD cs.AI
   IntrinsicVoice: Empowering LLMs with Intrinsic Real-time Voice Interaction Abilities
   Authors: Xin Zhang, Xiang Lyu, Zhihao Du, Qian Chen, Dong Zhang, Hangrui Hu, Chaohong Tan, Tianyu Zhao, Yuxuan Wang, Bin Zhang, Heng Lu, Yaqian Zhou, Xipeng Qiu
   Abstract: Current methods of building LLMs with voice interaction capabilities rely heavily on explicit text autoregressive generation before or during speech response generation to maintain content quality, which unfortunately brings computational overhead and increases latency in multi-turn interactions. To address this, we introduce IntrinsicVoice, an LLM designed with intrinsic real-time voice interaction capabilities. IntrinsicVoice aims to facilitate the transfer of textual capabilities of pre-trained LLMs to the speech modality by mitigating the modality gap between text and speech. Our novel architecture, GroupFormer, can reduce speech sequences to lengths comparable to text sequences while generating high-quality audio, significantly reducing the length difference between speech and text, speeding up inference, and alleviating long-text modeling issues. Additionally, we construct a multi-turn speech-to-speech dialogue dataset named \method-500k, which includes nearly 500k turns of speech-to-speech dialogues, and a cross-modality training strategy to enhance the semantic alignment between speech and text. Experimental results demonstrate that IntrinsicVoice can generate high-quality speech responses with latency lower than 100ms in multi-turn dialogue scenarios. Demos are available at https://instrinsicvoice.github.io/.
   Submitted 12 October, 2024; v1 submitted 9 October, 2024; originally announced October 2024.

6. arXiv:2410.07265 [pdf, other]  cs.AR cs.AI cs.LG cs.SE
   A Survey: Collaborative Hardware and Software Design in the Era of Large Language Models
   Authors: Cong Guo, Feng Cheng, Zhixu Du, James Kiessling, Jonathan Ku, Shiyu Li, Ziru Li, Mingyuan Ma, Tergel Molom-Ochir, Benjamin Morris, Haoxuan Shan, Jingwei Sun, Yitu Wang, Chiyue Wei, Xueying Wu, Yuhao Wu, Hao Frank Yang, Jingyang Zhang, Junyao Zhang, Qilin Zheng, Guanglei Zhou, Hai Li, Yiran Chen
   Abstract: The rapid development of large language models (LLMs) has significantly transformed the field of artificial intelligence, demonstrating remarkable capabilities in natural language processing and moving towards multi-modal functionality. These models are increasingly integrated into diverse applications, impacting both research and industry. However, their development and deployment present substantial challenges, including the need for extensive computational resources, high energy consumption, and complex software optimizations. Unlike traditional deep learning systems, LLMs require unique optimization strategies for training and inference, focusing on system-level efficiency. This paper surveys hardware and software co-design approaches specifically tailored to address the unique characteristics and constraints of large language models. This survey analyzes the challenges and impacts of LLMs on hardware and algorithm research, exploring algorithm optimization, hardware design, and system-level innovations. It aims to provide a comprehensive understanding of the trade-offs and considerations in LLM-centric computing systems, guiding future advancements in AI. Finally, we summarize the existing efforts in this space and outline future directions toward realizing production-grade co-design methodologies for the next generation of large language models and AI systems.
   Submitted 8 October, 2024; originally announced October 2024.
   Comments: Accepted by IEEE Circuits and Systems Magazine

7. arXiv:2409.17049 [pdf, other]  cs.CV cs.AI
   ControlCity: A Multimodal Diffusion Model Based Approach for Accurate Geospatial Data Generation and Urban Morphology Analysis
   Authors: Fangshuo Zhou, Huaxia Li, Rui Hu, Sensen Wu, Hailin Feng, Zhenhong Du, Liuchang Xu
   Abstract: Volunteer Geographic Information (VGI), with its rich variety, large volume, rapid updates, and diverse sources, has become a critical source of geospatial data. However, VGI data from platforms like OSM exhibit significant quality heterogeneity across different data types, particularly with urban building data. To address this, we propose a multi-source geographic data transformation solution, utilizing accessible and complete VGI data to assist in generating urban building footprint data. We also employ a multimodal data generation framework to improve accuracy. First, we introduce a pipeline for constructing an 'image-text-metadata-building footprint' dataset, primarily based on road network data and supplemented by other multimodal data. We then present ControlCity, a geographic data transformation method based on a multimodal diffusion model. This method first uses a pre-trained text-to-image model to align text, metadata, and building footprint data. An improved ControlNet further integrates road network and land-use imagery, producing refined building footprint data. Experiments across 22 global cities demonstrate that ControlCity successfully simulates real urban building patterns, achieving state-of-the-art performance. Specifically, our method achieves an average FID score of 50.94, reducing error by 71.01% compared to leading methods, and a MIoU score of 0.36, an improvement of 38.46%. Additionally, our model excels in tasks like urban morphology transfer, zero-shot city generation, and spatial data completeness assessment. In the zero-shot city task, our method accurately predicts and generates similar urban structures, demonstrating strong generalization. This study confirms the effectiveness of our approach in generating urban building footprint data and capturing complex city characteristics.
   Submitted 25 September, 2024; originally announced September 2024.
   Comments: 20 pages

8. arXiv:2409.15654 [pdf, other]  cs.AR
   Cambricon-LLM: A Chiplet-Based Hybrid Architecture for On-Device Inference of 70B LLM
   Authors: Zhongkai Yu, Shengwen Liang, Tianyun Ma, Yunke Cai, Ziyuan Nan, Di Huang, Xinkai Song, Yifan Hao, Jie Zhang, Tian Zhi, Yongwei Zhao, Zidong Du, Xing Hu, Qi Guo, Tianshi Chen
   Abstract: Deploying advanced large language models on edge devices, such as smartphones and robotics, is a growing trend that enhances user data privacy and network connectivity resilience while preserving intelligent capabilities. However, such a task exhibits single-batch computing with incredibly low arithmetic intensity, which poses the significant challenges of huge memory footprint and bandwidth demands on limited edge resources. To address these issues, we introduce Cambricon-LLM, a chiplet-based hybrid architecture with NPU and a dedicated NAND flash chip to enable efficient on-device inference of 70B LLMs. Such a hybrid architecture utilizes both the high computing capability of NPU and the data capacity of the NAND flash chip, with the proposed hardware-tiling strategy that minimizes the data movement overhead between NPU and NAND flash chip. Specifically, the NAND flash chip, enhanced by our innovative in-flash computing and on-die ECC techniques, excels at performing precise lightweight on-die processing. Simultaneously, the NPU collaborates with the flash chip for matrix operations and handles special function computations beyond the flash's on-die processing capabilities. Overall, Cambricon-LLM enables the on-device inference of 70B LLMs at a speed of 3.44 token/s, and 7B LLMs at a speed of 36.34 token/s, which is over 22X to 45X faster than existing flash-offloading technologies, showing the potentiality of deploying powerful LLMs in edge devices.
   Submitted 23 September, 2024; originally announced September 2024.
   Comments: 15 pages, 16 figures
   Journal ref: MICRO 2024

9. arXiv:2409.14019 [pdf, other]  cs.CV cs.AI cs.RO
   DOI: 10.1109/LRA.2024.3466077
   MOSE: Monocular Semantic Reconstruction Using NeRF-Lifted Noisy Priors
   Authors: Zhenhua Du, Binbin Xu, Haoyu Zhang, Kai Huo, Shuaifeng Zhi
   Abstract: Accurately reconstructing dense and semantically annotated 3D meshes from monocular images remains a challenging task due to the lack of geometry guidance and imperfect view-dependent 2D priors. Though we have witnessed recent advancements in implicit neural scene representations enabling precise 2D rendering simply from multi-view images, there have been few works addressing 3D scene understanding with monocular priors alone. In this paper, we propose MOSE, a neural field semantic reconstruction approach to lift inferred image-level noisy priors to 3D, producing accurate semantics and geometry in both 3D and 2D space. The key motivation for our method is to leverage generic class-agnostic segment masks as guidance to promote local consistency of rendered semantics during training. With the help of semantics, we further apply a smoothness regularization to texture-less regions for better geometric quality, thus achieving mutual benefits of geometry and semantics. Experiments on the ScanNet dataset show that our MOSE outperforms relevant baselines across all metrics on tasks of 3D semantic segmentation, 2D semantic segmentation and 3D surface reconstruction.
   Submitted 21 September, 2024; originally announced September 2024.
   Comments: 8 pages, 10 figures

10. arXiv:2409.13730 [pdf, other]  cs.AI cs.CL
   VisScience: An Extensive Benchmark for Evaluating K12 Educational Multi-modal Scientific Reasoning
   Authors: Zhihuan Jiang, Zhen Yang, Jinhao Chen, Zhengxiao Du, Weihan Wang, Bin Xu, Yuxiao Dong, Jie Tang
   Abstract: Multi-modal large language models (MLLMs) have demonstrated promising capabilities across various tasks by integrating textual and visual information to achieve visual understanding in complex scenarios. Despite the availability of several benchmarks aimed at evaluating MLLMs in tasks from visual question answering to complex problem-solving, most focus predominantly on mathematics or general visual understanding tasks. This reveals a critical gap in current benchmarks, which often overlook the inclusion of other key scientific disciplines such as physics and chemistry. To address this gap, we meticulously construct a comprehensive benchmark, named VisScience, which is utilized to assess multi-modal scientific reasoning across the three disciplines of mathematics, physics, and chemistry. This benchmark comprises 3,000 questions drawn from K12 education - spanning elementary school through high school - equally distributed across three disciplines, with 1,000 questions per discipline. The questions within VisScience span 21 distinct subjects and are categorized into five difficulty levels, offering a broad spectrum of topics within each discipline. With VisScience, we present a detailed evaluation of the performance of 25 representative MLLMs in scientific reasoning. Experimental results demonstrate that closed-source MLLMs generally outperform open-source models. The best performance observed includes 53.4% accuracy in mathematics by Claude3.5-Sonnet, 38.2% in physics by GPT-4o, and 47.0% in chemistry by Gemini-1.5-Pro. These results underscore the strengths and limitations of MLLMs, suggesting areas for future improvement and highlighting the importance of developing models that can effectively handle the diverse demands of multi-modal scientific reasoning.
   Submitted 9 September, 2024; originally announced September 2024.
   Comments: 89 pages, 70 figures
To extensively evaluate the effectiveness of MathGLM-Vision, we conduct experiments on several public benchmarks and our curated MathVL-test consisting of 2,000 problems. Experimental results demonstrate that MathGLM-Vision achieves significant improvements compared with some existing models, including backbone models and open-source mathematical MLLMs. These findings indicate the importance of dataset diversity in enhancing the mathematical reasoning abilities of MLLMs.
Submitted 9 September, 2024; originally announced September 2024.
Comments: 30 pages, 19 figures

arXiv:2408.14438 [pdf, other] (https://arxiv.org/abs/2408.14438) cs.CL, cs.CY
Evaluating Large Language Models on Spatial Tasks: A Multi-Task Benchmarking Study
Authors: Liuchang Xu, Shuo Zhao, Qingming Lin, Luyao Chen, Qianqian Luo, Sensen Wu, Xinyue Ye, Hailin Feng, Zhenhong Du
Abstract: The advent of large language models such as ChatGPT, Gemini, and others has underscored the importance of evaluating their diverse capabilities, ranging from natural language understanding to code generation. However, their performance on spatial tasks has not been comprehensively assessed.
This study addresses this gap by introducing a novel multi-task spatial evaluation dataset designed to systematically explore and compare the performance of several advanced models on spatial tasks. The dataset encompasses twelve distinct task types, including spatial understanding and path planning, each with verified, accurate answers. We evaluated multiple models, including OpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase testing approach. Initially, we conducted zero-shot testing, followed by categorizing the dataset by difficulty and performing prompt-tuning tests. Results indicate that gpt-4o achieved the highest overall accuracy in the first phase, with an average of 71.3%. Although moonshot-v1-8k slightly underperformed overall, it surpassed gpt-4o in place name recognition tasks. The study also highlights the impact of prompt strategies on model performance in specific tasks. For example, the Chain-of-Thought (CoT) strategy increased gpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot strategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to 76.3%.
Submitted 2 September, 2024; v1 submitted 26 August, 2024; originally announced August 2024.
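
A minimal sketch of the two prompting regimes compared above, zero-shot versus Chain-of-Thought, against the OpenAI chat API; the example question and instruction wording are illustrative, not items from the paper's dataset:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

QUESTION = "Starting at A, go 3 blocks north and 4 blocks east. How far are you from A?"

def ask(question, chain_of_thought=False):
    # With CoT we simply ask the model to reason step by step before answering.
    instruction = ("Think step by step, then give the final answer."
                   if chain_of_thought else "Give only the final answer.")
    resp = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": instruction},
            {"role": "user", "content": question},
        ],
    )
    return resp.choices[0].message.content

print(ask(QUESTION))                          # zero-shot
print(ask(QUESTION, chain_of_thought=True))   # CoT prompting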

arXiv:2408.06577 [pdf, other] (https://arxiv.org/abs/2408.06577) cs.IR
Prompt Tuning as User Inherent Profile Inference Machine
Authors: Yusheng Lu, Zhaocheng Du, Xiangyang Li, Xiangyu Zhao, Weiwen Liu, Yichao Wang, Huifeng Guo, Ruiming Tang, Zhenhua Dong, Yongrui Duan
Abstract: Large Language Models (LLMs) have exhibited significant promise in recommender systems by empowering user profiles with their extensive world knowledge and superior reasoning capabilities. However, LLMs face challenges such as unstable instruction compliance, modality gaps, and high inference latency, leading to textual noise and limiting their effectiveness in recommender systems. To address these challenges, we propose UserIP-Tuning, which uses prompt tuning to infer user profiles. It integrates the causal relationship between user profiles and behavior sequences into LLMs' prompts and employs expectation maximization to infer the embedded latent profile, minimizing textual noise by fixing the prompt template. Furthermore, a profile quantization codebook bridges the modality gap by categorizing profile embeddings into collaborative IDs, which are pre-stored for online deployment. This improves time efficiency and reduces memory usage. Experiments on four public datasets show that UserIP-Tuning outperforms state-of-the-art recommendation algorithms. Additional tests and case studies confirm its effectiveness, robustness, and transferability.
Submitted 12 August, 2024; originally announced August 2024.
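
A rough illustration of the quantization step described above, mapping a continuous profile embedding to a discrete collaborative ID via a codebook; the codebook size, dimension, and random initialization are placeholders rather than the paper's configuration:

import numpy as np

rng = np.random.default_rng(0)
EMB_DIM, CODEBOOK_SIZE = 64, 256

# Hypothetical codebook; in UserIP-Tuning it would be learned, not random.
codebook = rng.normal(size=(CODEBOOK_SIZE, EMB_DIM))

def quantize(profile_emb):
    # Map a continuous profile embedding to its nearest codeword index
    # (a "collaborative ID" that can be pre-stored for online serving).
    dists = np.linalg.norm(codebook - profile_emb, axis=1)
    return int(np.argmin(dists))

profile = rng.normal(size=EMB_DIM)      # stand-in for an inferred latent profile
collab_id = quantize(profile)
serving_vector = codebook[collab_id]    # what an online system would look up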

arXiv:2408.06327 [pdf, other] (https://arxiv.org/abs/2408.06327) cs.AI, cs.CL, cs.CV
VisualAgentBench: Towards Large Multimodal Models as Visual Foundation Agents
Authors: Xiao Liu, Tianjie Zhang, Yu Gu, Iat Long Iong, Yifan Xu, Xixuan Song, Shudan Zhang, Hanyu Lai, Xinyi Liu, Hanlin Zhao, Jiadai Sun, Xinyue Yang, Yu Yang, Zehan Qi, Shuntian Yao, Xueqiao Sun, Siyi Cheng, Qinkai Zheng, Hao Yu, Hanchen Zhang, Wenyi Hong, Ming Ding, Lihang Pan, Xiaotao Gu, Aohan Zeng, et al. (5 additional authors not shown)
Abstract: Large Multimodal Models (LMMs) have ushered in a new era in artificial intelligence, merging capabilities in both language and vision to form highly capable Visual Foundation Agents.
These agents are postulated to excel across a myriad of tasks, potentially approaching general artificial intelligence. However, existing benchmarks fail to sufficiently challenge or showcase the full potential of LMMs in complex, real-world environments. To address this gap, we introduce VisualAgentBench (VAB), a comprehensive and pioneering benchmark specifically designed to train and evaluate LMMs as visual foundation agents across diverse scenarios, including Embodied, Graphical User Interface, and Visual Design, with tasks formulated to probe the depth of LMMs' understanding and interaction capabilities. Through rigorous testing across nine proprietary LMM APIs and eight open models, we demonstrate the considerable yet still developing agent capabilities of these models. Additionally, VAB provides a trajectory training set constructed through hybrid methods, including Program-based Solvers, LMM Agent Bootstrapping, and Human Demonstrations, promoting substantial performance improvements in LMMs through behavior cloning. Our work not only aims to benchmark existing models but also provides a solid foundation for future development of visual foundation agents. Code, train & test data, and part of the fine-tuned open LMMs are available at https://github.com/THUDM/VisualAgentBench.
Submitted 12 August, 2024; originally announced August 2024.

arXiv:2408.00418 [pdf, other] (https://arxiv.org/abs/2408.00418) cs.CV
Towards Reliable Advertising Image Generation Using Human Feedback
Authors: Zhenbang Du, Wei Feng, Haohan Wang, Yaoyu Li, Jingsen Wang, Jian Li, Zheng Zhang, Jingjing Lv, Xin Zhu, Junsheng Jin, Junjie Shen, Zhangang Lin, Jingping Shao
Abstract: In the e-commerce realm, compelling advertising images are pivotal for attracting customer attention. While generative models automate image generation, they often produce substandard images that may mislead customers and require significant labor costs to inspect. This paper focuses on increasing the rate of available generated images. We first introduce a multi-modal Reliable Feedback Network (RFNet) to automatically inspect the generated images. Combining RFNet into a recurrent process, Recurrent Generation, results in a higher number of available advertising images. To further enhance production efficiency, we fine-tune diffusion models with an innovative Consistent Condition regularization utilizing the feedback from RFNet (RFFT). This yields a remarkable increase in the available rate of generated images, reduces the number of attempts in Recurrent Generation, and provides a highly efficient production process without sacrificing visual appeal. We also construct a Reliable Feedback 1 Million (RF1M) dataset comprising over one million generated advertising images annotated by humans, which helps train RFNet to accurately assess the availability of generated images and faithfully reflect human feedback. Overall, our approach offers a reliable solution for advertising image generation.
Submitted 1 August, 2024; originally announced August 2024.
Comments: ECCV 2024
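
The Recurrent Generation loop described above can be pictured with a short sketch; the generator and inspector below are placeholders standing in for the diffusion model and RFNet, which are not reproduced here:

def recurrent_generation(prompt, generate, inspect, max_attempts=5, threshold=0.5):
    # Generate an ad image and re-sample until the feedback model accepts it.
    # generate : callable(prompt) -> image          (stand-in for the diffusion model)
    # inspect  : callable(image, prompt) -> float   (stand-in for RFNet's reliability score)
    for attempt in range(1, max_attempts + 1):
        image = generate(prompt)
        score = inspect(image, prompt)
        if score >= threshold:
            return image, attempt        # usable image found
    return None, max_attempts            # fall back to manual inspection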

arXiv:2407.19696 [pdf, other] (https://arxiv.org/abs/2407.19696) cs.CV
Cross-Layer Feature Pyramid Transformer for Small Object Detection in Aerial Images
Authors: Zewen Du, Zhenjiang Hu, Guiyu Zhao, Ying Jin, Hongbin Ma
Abstract: Object detection in aerial images has always been a challenging task due to the generally small size of the objects. Most current detectors prioritize novel detection frameworks, often overlooking research on fundamental components such as feature pyramid networks. In this paper, we introduce the Cross-Layer Feature Pyramid Transformer (CFPT), a novel upsampler-free feature pyramid network designed specifically for small object detection in aerial images.
CFPT incorporates two meticulously designed attention blocks with linear computational complexity: the Cross-Layer Channel-Wise Attention (CCA) and the Cross-Layer Spatial-Wise Attention (CSA). CCA achieves cross-layer interaction by dividing channel-wise token groups to perceive cross-layer global information along the spatial dimension, while CSA completes cross-layer interaction by dividing spatial-wise token groups to perceive cross-layer global information along the channel dimension. By integrating these modules, CFPT enables cross-layer interaction in one step, thereby avoiding the semantic gap and information loss associated with element-wise summation and layer-by-layer transmission. Furthermore, CFPT incorporates global contextual information, which enhances detection performance for small objects. To further enhance location awareness during cross-layer interaction, we propose the Cross-Layer Consistent Relative Positional Encoding (CCPE) based on inter-layer mutual receptive fields. We evaluate the effectiveness of CFPT on two challenging object detection datasets in aerial images, namely VisDrone2019-DET and TinyPerson. Extensive experiments demonstrate the effectiveness of CFPT, which outperforms state-of-the-art feature pyramid networks while incurring lower computational costs. The code will be released at https://github.com/duzw9311/CFPT.
Submitted 29 July, 2024; originally announced July 2024.
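
A toy sketch of cross-layer attention over channel-group tokens, only to illustrate the general idea of letting all pyramid levels exchange information in a single attention step; this is an approximation for illustration and not the paper's CCA/CSA blocks:

import torch
import torch.nn as nn

class ToyCrossLayerAttention(nn.Module):
    # One token per (level, channel group); attention runs across pyramid levels.
    def __init__(self, channels=64, groups=4, heads=2):
        super().__init__()
        assert channels % groups == 0
        self.groups = groups
        self.dim = channels // groups
        self.attn = nn.MultiheadAttention(self.dim, heads, batch_first=True)

    def forward(self, feats):                     # feats: list of [B, C, H, W]
        b, c = feats[0].shape[:2]
        # Global-average-pool each channel group into a token per level.
        tokens = torch.stack(
            [f.mean(dim=(2, 3)).view(b, self.groups, self.dim) for f in feats],
            dim=2)                                # [B, G, L, dim]
        tokens = tokens.flatten(0, 1)             # [B*G, L, dim]
        mixed, _ = self.attn(tokens, tokens, tokens)
        mixed = mixed.view(b, self.groups, len(feats), self.dim)
        # Broadcast the cross-layer context back onto each feature map.
        return [f + mixed[:, :, i].reshape(b, c, 1, 1) for i, f in enumerate(feats)]

feats = [torch.randn(2, 64, s, s) for s in (32, 16, 8)]
out = ToyCrossLayerAttention()(feats)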

arXiv:2407.10424 [pdf, other] (https://arxiv.org/abs/2407.10424) cs.PL, cs.AI
CodeV: Empowering LLMs for Verilog Generation through Multi-Level Summarization
Authors: Yang Zhao, Di Huang, Chongxiao Li, Pengwei Jin, Ziyuan Nan, Tianyun Ma, Lei Qi, Yansong Pan, Zhenxing Zhang, Rui Zhang, Xishan Zhang, Zidong Du, Qi Guo, Xing Hu, Yunji Chen
Abstract: The increasing complexity and high costs associated with modern processor design have led to a surge in demand for processor design automation. Instruction-tuned large language models (LLMs) have demonstrated remarkable performance in automatically generating code for general-purpose programming languages like Python. However, these methods fail on hardware description languages (HDLs) like Verilog due to the scarcity of high-quality instruction-tuning data, as even advanced LLMs like GPT-3.5 exhibit limited performance on Verilog generation. In response, we observe that (1) Verilog code collected from the real world is of higher quality than that generated by LLMs, and (2) LLMs like GPT-3.5 excel at summarizing Verilog code rather than generating it. Based on these observations, this paper introduces CodeV, a series of open-source instruction-tuned Verilog generation LLMs. Instead of generating descriptions first and then obtaining the corresponding code from advanced LLMs, we prompt the LLM with Verilog code and let it generate the corresponding natural language description through multi-level summarization. Experimental results show that CodeV surpasses the previous open-source SOTA by a relative 14.4% (BetterV in VerilogEval) and 11.3% (RTLCoder in RTLLM), and also outperforms the previous commercial SOTA, GPT-4, by a relative 22.1% in VerilogEval.
Submitted 20 July, 2024; v1 submitted 14 July, 2024; originally announced July 2024.
Comments: 16 pages, 8 figures, conference
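
To make the data-construction idea concrete, here is a hedged sketch of turning real Verilog snippets into (instruction, code) pairs via multi-level summarization; ask_llm is a placeholder for the summarization model, and the prompt wording is illustrative only:

def build_pair(verilog_code, ask_llm):
    # Summarize real Verilog at increasing levels of abstraction, then use the
    # final high-level description as the instruction of an (instruction, code) pair.
    low = ask_llm("Describe what each always block and assignment in this "
                  "Verilog module does:\n" + verilog_code)
    high = ask_llm("Based on the notes below, write a one-paragraph functional "
                   "specification of the module:\n" + low)
    return {"instruction": high, "response": verilog_code}

# dataset = [build_pair(code, ask_llm) for code in real_world_verilog]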

arXiv:2407.09977 [pdf] (https://arxiv.org/abs/2407.09977) physics.geo-ph, cs.AI
Mitigating Interpretation Bias in Rock Records with Large Language Models: Insights from Paleoenvironmental Analysis
Authors: Luoqi Wang, Haipeng Li, Linshu Hu, Jiarui Cai, Zhenhong Du
Abstract: The reconstruction of Earth's history faces significant challenges due to the nonunique interpretations often derived from rock records.
The problem has long been recognized, but there are no systematic solutions in practice. This study introduces an innovative approach that leverages Large Language Models (LLMs) along with retrieval-augmented generation and real-time search capabilities to counteract interpretation biases, thereby enhancing the accuracy and reliability of geological analyses. By applying this framework to sedimentology and paleogeography, we demonstrate its effectiveness in mitigating interpretation biases through the generation and evaluation of multiple hypotheses for the same data, which can effectively reduce human bias. Our research illuminates the transformative potential of LLMs in refining paleoenvironmental studies and extends their applicability across various sub-disciplines of Earth sciences, enabling a deeper and more accurate depiction of Earth's evolution.
Submitted 17 May, 2024; originally announced July 2024.

arXiv:2407.09053 [pdf, other] (https://arxiv.org/abs/2407.09053) cs.RO
Navi2Gaze: Leveraging Foundation Models for Navigation and Target Gazing
Authors: Jun Zhu, Zihao Du, Haotian Xu, Fengbo Lan, Zilong Zheng, Bo Ma, Shengjie Wang, Tao Zhang
Abstract: Task-aware navigation continues to be a challenging area of research, especially in scenarios involving open vocabulary. Previous studies primarily focus on finding suitable locations for task completion, often overlooking the importance of the robot's pose.
However, the robot's orientation is crucial for successfully completing tasks because of how objects are arranged (e.g., to open a refrigerator door). Humans intuitively navigate to objects with the right orientation using semantics and common sense. For instance, when opening a refrigerator, we naturally stand in front of it rather than to the side. Recent advances suggest that Vision-Language Models (VLMs) can provide robots with similar common sense. Therefore, we develop a VLM-driven method called Navigation-to-Gaze (Navi2Gaze) for efficient navigation and object gazing based on task descriptions. This method uses the VLM to automatically score and select the best pose from numerous candidates. In evaluations on multiple photorealistic simulation benchmarks, Navi2Gaze significantly outperforms existing approaches by precisely determining the optimal orientation relative to target objects, resulting in a 68.8% reduction in Distance to Goal (DTG). Real-world video demonstrations can be found on the supplementary website.
Submitted 16 September, 2024; v1 submitted 12 July, 2024; originally announced July 2024.
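
A rough illustration of the pose-selection step: the VLM scorer is a placeholder, and the dummy scorer below simply prefers candidate poses that face a target point:

import math

def select_pose(candidates, task, score_with_vlm):
    # candidates     : list of (x, y, yaw) poses sampled near the target object.
    # score_with_vlm : callable(pose, task) -> float, e.g. by rendering the view
    #                  at the pose and asking a VLM to rate its suitability.
    return max(candidates, key=lambda pose: score_with_vlm(pose, task))

# Example with a dummy scorer that prefers poses facing the point (1.0, 2.0):
target = (1.0, 2.0)
def dummy_score(pose, task):
    x, y, yaw = pose
    return -abs(math.atan2(target[1] - y, target[0] - x) - yaw)

best = select_pose([(0, 0, 0.0), (0, 0, 1.1), (0, 0, 2.0)], "open the fridge", dummy_score)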

arXiv:2407.08903 [pdf, other] (https://arxiv.org/abs/2407.08903) cs.CR, cs.AI, cs.AR; doi: 10.1145/3622781.3674168 (https://doi.org/10.1145/3622781.3674168)
TensorTEE: Unifying Heterogeneous TEE Granularity for Efficient Secure Collaborative Tensor Computing
Authors: Husheng Han, Xinyao Zheng, Yuanbo Wen, Yifan Hao, Erhu Feng, Ling Liang, Jianan Mu, Xiaqing Li, Tianyun Ma, Pengwei Jin, Xinkai Song, Zidong Du, Qi Guo, Xing Hu
Abstract: Heterogeneous collaborative computing with NPU and CPU has received widespread attention due to its substantial performance benefits. To ensure data confidentiality and integrity during computing, Trusted Execution Environments (TEEs) are considered a promising solution because of their comparatively lower overhead.
However, existing heterogeneous TEE designs are inefficient for collaborative computing due to the fine and differing memory granularities of the CPU and NPU: (1) the cacheline granularity of the CPU TEE intensifies memory pressure due to its extra memory accesses, (2) the cacheline-granularity MAC of the NPU escalates the pressure on the limited memory storage, and (3) data transfer across heterogeneous enclaves relies on transit through non-secure regions, resulting in cumbersome re-encryption and scheduling. To address these issues, we propose TensorTEE, a unified tensor-granularity heterogeneous TEE for efficient secure collaborative tensor computing. First, we virtually support tensor granularity in the CPU TEE to eliminate off-chip metadata accesses by detecting and maintaining tensor structures on chip. Second, we propose tensor-granularity MAC management with predictive execution to avoid computational stalls while eliminating off-chip MAC storage and access. Moreover, based on the unified granularity, we enable direct data transfer without re-encryption and scheduling dilemmas. Our evaluation is built on an enhanced Gem5 and a cycle-accurate NPU simulator. The results show that TensorTEE improves the performance of Large Language Model (LLM) training workloads by 4.0x compared to existing work and incurs only 2.1% overhead compared to non-secure training, offering a practical security assurance for LLM training.
Submitted 11 July, 2024; originally announced July 2024.
Comments: Accepted by ASPLOS 2024

arXiv:2407.06841 [pdf, other] (https://arxiv.org/abs/2407.06841) cs.CV
HTD-Mamba: Efficient Hyperspectral Target Detection with Pyramid State Space Model
Authors: Dunbin Shen, Xuanbing Zhu, Jiacheng Tian, Jianjun Liu, Zhenrong Du, Hongyu Wang, Xiaorui Ma
Abstract: Hyperspectral target detection (HTD) identifies objects of interest from complex backgrounds at the pixel level, playing a vital role in Earth observation. However, HTD faces challenges due to limited prior knowledge and spectral variation, leading to underfitting models and unreliable performance. To address these challenges, this paper proposes an efficient self-supervised HTD method with a pyramid state space model (SSM), named HTD-Mamba, which employs spectrally contrastive learning to distinguish between target and background based on the similarity measurement of intrinsic features. Specifically, to obtain sufficient training samples and leverage spatial contextual information, we propose a spatial-encoded spectral augmentation technique that encodes all surrounding pixels within a patch into a transformed view of the center pixel. Additionally, to explore global band correlations, we divide pixels into continuous group-wise spectral embeddings and introduce Mamba to HTD for the first time to model long-range dependencies of the spectral sequence with linear complexity. Furthermore, to alleviate spectral variation and enhance robust representation, we propose a pyramid SSM as a backbone to capture and fuse multiresolution spectral-wise intrinsic features. Extensive experiments conducted on four public datasets demonstrate that the proposed method outperforms state-of-the-art methods in both quantitative and qualitative evaluations. Code is available at https://github.com/shendb2022/HTD-Mamba.
Submitted 17 July, 2024; v1 submitted 9 July, 2024; originally announced July 2024.
Comments: 13 pages, 6 figures, 5 tables
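
A toy version of the spatial-encoded spectral augmentation idea, building an alternative view of the center pixel's spectrum from similarity-weighted neighbors; this only illustrates the concept of encoding surrounding pixels into a transformed view and is not the paper's exact formulation:

import numpy as np

def spatial_encoded_view(patch):
    # patch : [P, P, B] hyperspectral patch with P odd; returns a [B] vector.
    p = patch.shape[0] // 2
    center = patch[p, p]                                  # center pixel spectrum [B]
    pixels = patch.reshape(-1, patch.shape[-1])           # all pixels in patch [P*P, B]
    # Weight neighbors by cosine similarity to the center, then average spectra.
    sims = pixels @ center / (
        np.linalg.norm(pixels, axis=1) * np.linalg.norm(center) + 1e-8)
    weights = np.exp(sims) / np.exp(sims).sum()
    return weights @ pixels                               # transformed view of the center

view = spatial_encoded_view(np.random.rand(7, 7, 100))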

arXiv:2407.05700 [pdf, other] (https://arxiv.org/abs/2407.05700) cs.CL, cs.AI, cs.SE
InverseCoder: Unleashing the Power of Instruction-Tuned Code LLMs with Inverse-Instruct
Authors: Yutong Wu, Di Huang, Wenxuan Shi, Wei Wang, Lingzhe Gao, Shihao Liu, Ziyuan Nan, Kaizhao Yuan, Rui Zhang, Xishan Zhang, Zidong Du, Qi Guo, Yewen Pu, Dawei Yin, Xing Hu, Yunji Chen
Abstract: Recent advancements in open-source code large language models (LLMs) have demonstrated remarkable coding abilities by fine-tuning on the data generated from powerful closed-source LLMs such as GPT-3.5 and GPT-4 for instruction tuning. This paper explores how to further improve an instruction-tuned code LLM by generating data from itself rather than querying closed-source LLMs. Our key observation is the misalignment between the translation of formal and informal languages: translating formal language (i.e., code) to informal language (i.e., natural language) is more straightforward than the reverse. Based on this observation, we propose INVERSE-INSTRUCT, which summarizes instructions from code snippets instead of the reverse. Specifically, given an instruction tuning corpus for code and the resulting instruction-tuned code LLM, we ask the code LLM to generate additional high-quality instructions for the original corpus through code summarization and self-evaluation. Then, we fine-tune the base LLM on the combination of the original corpus and the self-generated one, which yields a stronger instruction-tuned LLM. We present a series of code LLMs named InverseCoder, which surpasses the performance of the original code LLMs on a wide range of benchmarks, including Python text-to-code generation, multilingual coding, and data-science code generation.
Submitted 8 July, 2024; originally announced July 2024.
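
A hedged sketch of the self-augmentation loop described above; the prompts, scoring scale, and code_llm interface are placeholders rather than the paper's actual pipeline:

def inverse_instruct(corpus, code_llm, keep_threshold=7):
    # Generate extra (instruction, code) pairs by summarizing code snippets with
    # the instruction-tuned code LLM itself, then keep only self-evaluated good pairs.
    extra = []
    for item in corpus:                                   # item: {"instruction", "code"}
        summary = code_llm("Summarize this code as a task instruction:\n" + item["code"])
        verdict = code_llm("On a scale of 1-10, does this instruction match the code?\n"
                           f"Instruction: {summary}\nCode:\n{item['code']}\nAnswer with a number.")
        try:
            score = int(verdict.strip().split()[0])
        except (ValueError, IndexError):
            continue
        if score >= keep_threshold:
            extra.append({"instruction": summary, "code": item["code"]})
    return corpus + extra   # fine-tune the base LLM on the combined corpus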

arXiv:2407.05407 [pdf, other] (https://arxiv.org/abs/2407.05407) cs.SD, cs.AI, eess.AS
CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens
Authors: Zhihao Du, Qian Chen, Shiliang Zhang, Kai Hu, Heng Lu, Yexin Yang, Hangrui Hu, Siqi Zheng, Yue Gu, Ziyang Ma, Zhifu Gao, Zhijie Yan
Abstract: Recent years have witnessed a trend in which large language model (LLM) based text-to-speech (TTS) has emerged into the mainstream owing to its high naturalness and zero-shot capacity. In this paradigm, speech signals are discretized into token sequences, which are modeled by an LLM with text as prompts and reconstructed by a token-based vocoder into waveforms. Speech tokens therefore play a critical role in LLM-based TTS models. Current speech tokens are learned in an unsupervised manner, which lacks explicit semantic information and alignment to the text. In this paper, we propose to represent speech with supervised semantic tokens, which are derived from a multilingual speech recognition model by inserting vector quantization into the encoder. Based on the tokens, we further propose a scalable zero-shot TTS synthesizer, CosyVoice, which consists of an LLM for text-to-token generation and a conditional flow matching model for token-to-speech synthesis. Experimental results show that supervised semantic tokens significantly outperform existing unsupervised tokens in terms of content consistency and speaker similarity for zero-shot voice cloning. Moreover, we find that utilizing large-scale data further improves synthesis performance, indicating the scalable capacity of CosyVoice. To the best of our knowledge, this is the first attempt to incorporate supervised speech tokens into TTS models.
Submitted 9 July, 2024; v1 submitted 7 July, 2024; originally announced July 2024.
Comments: Work in progress. arXiv admin note: substantial text overlap with arXiv:2407.04051
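
To illustrate the tokenization idea, here is a toy nearest-neighbor vector-quantization step that maps ASR-encoder frames to discrete token IDs; the codebook size, dimensions, and inputs are random stand-ins, not CosyVoice's actual components:

import torch

codebook = torch.randn(4096, 512)      # hypothetical codebook: 4096 tokens, dim 512

def to_semantic_tokens(encoder_frames):
    # Quantize encoder frames [T, 512] to discrete token IDs [T] by nearest
    # codebook entry; this is the kind of discrete sequence a TTS language
    # model would later predict from text.
    dists = torch.cdist(encoder_frames, codebook)   # [T, 4096]
    return dists.argmin(dim=1)

tokens = to_semantic_tokens(torch.randn(200, 512))  # e.g. 200 frames -> 200 token IDs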

arXiv:2407.04051 [pdf, other] (https://arxiv.org/abs/2407.04051) cs.SD, cs.AI, eess.AS
FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs
Authors: Keyu An, Qian Chen, Chong Deng, Zhihao Du, Changfeng Gao, Zhifu Gao, Yue Gu, Ting He, Hangrui Hu, Kai Hu, Shengpeng Ji, Yabin Li, Zerui Li, Heng Lu, Haoneng Luo, Xiang Lv, Bin Ma, Ziyang Ma,
href="/search/cs?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/cs?searchtype=author&query=Song%2C+C">Changhe Song</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiaqi Shi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xian Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04051v3-abstract-short" style="display: inline;"> This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04051v3-abstract-full').style.display = 'inline'; document.getElementById('2407.04051v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04051v3-abstract-full" style="display: none;"> This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, speaking style, and speaker identity. SenseVoice-Small delivers exceptionally low-latency ASR for 5 languages, and SenseVoice-Large supports high-precision ASR for over 50 languages, while CosyVoice excels in multi-lingual voice generation, zero-shot in-context learning, cross-lingual voice cloning, and instruction-following capabilities. The models related to SenseVoice and CosyVoice have been open-sourced on Modelscope and Huggingface, along with the corresponding training, inference, and fine-tuning codes released on GitHub. By integrating these models with LLMs, FunAudioLLM enables applications such as speech-to-speech translation, emotional voice chat, interactive podcasts, and expressive audiobook narration, thereby pushing the boundaries of voice interaction technology. Demos are available at https://fun-audio-llm.github.io, and the code can be accessed at https://github.com/FunAudioLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04051v3-abstract-full').style.display = 'none'; document.getElementById('2407.04051v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress. 
Authors are listed in alphabetical order by family name</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16743">arXiv:2406.16743</a> <span> [<a href="https://arxiv.org/pdf/2406.16743">pdf</a>, <a href="https://arxiv.org/format/2406.16743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Contrastive Decoding: Boosting Safety Alignment of Large Language Models via Opposite Prompt Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhengyue Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoyun Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kaidi Xu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xing Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zidong Du</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qi Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunji Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16743v1-abstract-short" style="display: inline;"> With the widespread application of Large Language Models (LLMs), it has become a significant concern to ensure their safety and prevent harmful responses. While current safe-alignment methods based on instruction fine-tuning and Reinforcement Learning from Human Feedback (RLHF) can effectively reduce harmful responses from LLMs, they often require high-quality datasets and heavy computational over… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16743v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16743v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16743v1-abstract-full" style="display: none;"> With the widespread application of Large Language Models (LLMs), it has become a significant concern to ensure their safety and prevent harmful responses. While current safe-alignment methods based on instruction fine-tuning and Reinforcement Learning from Human Feedback (RLHF) can effectively reduce harmful responses from LLMs, they often require high-quality datasets and heavy computational overhead during model training. Another way to align language models is to modify the logit of tokens in model outputs without heavy training. Recent studies have shown that contrastive decoding can enhance the performance of language models by reducing the likelihood of confused tokens. However, these methods require the manual selection of contrastive models or instruction templates. To this end, we propose Adversarial Contrastive Decoding (ACD), an optimization-based framework to generate two opposite system prompts for prompt-based contrastive decoding. ACD only needs to apply a lightweight prompt tuning on a rather small anchor dataset (< 3 min for each model) without training the target model. 
Experiments conducted on extensive models and benchmarks demonstrate that the proposed method achieves much better safety performance than previous model training-free decoding methods without sacrificing its original generation ability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16743v1-abstract-full').style.display = 'none'; document.getElementById('2406.16743v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16502">arXiv:2406.16502</a> <span> [<a href="https://arxiv.org/pdf/2406.16502">pdf</a>, <a href="https://arxiv.org/format/2406.16502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LOGCAN++: Adaptive Local-global class-aware network for semantic segmentation of remote sensing imagery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaowen Ma</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+R">Rongrong Lian</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenkai Wu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongbo Guo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+M">Mengting Ma</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Sensen Wu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhenhong Du</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Siyang Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16502v2-abstract-short" style="display: inline;"> Remote sensing images are usually characterized by complex backgrounds, scale and orientation variations, and large intra-class variance. General semantic segmentation methods usually fail to fully investigate the above issues, and thus their performance on remote sensing image segmentation is limited. In this paper, we propose our LOGCAN++, a semantic segmentation model customized for remote sensin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16502v2-abstract-full').style.display = 'inline'; document.getElementById('2406.16502v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16502v2-abstract-full" style="display: none;"> Remote sensing images are usually characterized by complex backgrounds, scale and orientation variations, and large intra-class variance. General semantic segmentation methods usually fail to fully investigate the above issues, and thus their performance on remote sensing image segmentation is limited. In this paper, we propose our LOGCAN++, a semantic segmentation model customized for remote sensing images, which is made up of a Global Class Awareness (GCA) module and several Local Class Awareness (LCA) modules.
The GCA module captures global representations for class-level context modeling to reduce the interference of background noise. The LCA module generates local class representations as intermediate perceptual elements to indirectly associate pixels with the global class representations, targeting at dealing with the large intra-class variance problem. In particular, we introduce affine transformations in the LCA module for adaptive extraction of local class representations to effectively tolerate scale and orientation variations in remotely sensed images. Extensive experiments on three benchmark datasets show that our LOGCAN++ outperforms current mainstream general and remote sensing semantic segmentation methods and achieves a better trade-off between speed and accuracy. Code is available at https://github.com/xwmaxwma/rssegmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16502v2-abstract-full').style.display = 'none'; document.getElementById('2406.16502v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14928">arXiv:2406.14928</a> <span> [<a href="https://arxiv.org/pdf/2406.14928">pdf</a>, <a href="https://arxiv.org/format/2406.14928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Autonomous Agents for Collaborative Task under Information Asymmetry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenxi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifei Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zihao Xie</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+R">Rennai Qiu</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yufan Dang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhuoyun Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weize Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C">Chen Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14928v2-abstract-short" style="display: inline;"> 
Large Language Model Multi-Agent Systems (LLM-MAS) have achieved great progress in solving complex tasks. Such systems perform communication among agents to collaboratively solve tasks, under the premise of shared information. However, when agents' collaborations are leveraged to perform multi-person tasks, a new challenge arises due to information asymmetry, since each agent can only acc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14928v2-abstract-full').style.display = 'inline'; document.getElementById('2406.14928v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14928v2-abstract-full" style="display: none;"> Large Language Model Multi-Agent Systems (LLM-MAS) have achieved great progress in solving complex tasks. Such systems perform communication among agents to collaboratively solve tasks, under the premise of shared information. However, when agents' collaborations are leveraged to perform multi-person tasks, a new challenge arises due to information asymmetry, since each agent can only access the information of its human user. Previous MAS struggle to complete tasks under this condition. To address this, we propose a new MAS paradigm termed iAgents, which denotes Informative Multi-Agent Systems. In iAgents, the human social network is mirrored in the agent network, where agents proactively exchange human information necessary for task resolution, thereby overcoming information asymmetry. iAgents employs a novel agent reasoning mechanism, InfoNav, to navigate agents' communication toward effective information exchange. Together with InfoNav, iAgents organizes human information in a mixed memory to provide agents with accurate and comprehensive information for exchange. Additionally, we introduce InformativeBench, the first benchmark tailored for evaluating LLM agents' task-solving ability under information asymmetry. Experimental results show that iAgents can collaborate within a social network of 140 individuals and 588 relationships, autonomously communicate over 30 turns, and retrieve information from nearly 70,000 messages to complete tasks within 3 minutes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14928v2-abstract-full').style.display = 'none'; document.getElementById('2406.14928v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages, 12 figures, 6 tables, accepted by NeurIPS 2024, see detail at https://thinkwee.top/iagents</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14912">arXiv:2406.14912</a> <span> [<a href="https://arxiv.org/pdf/2406.14912">pdf</a>, <a href="https://arxiv.org/format/2406.14912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FC3DNet: A Fully Connected Encoder-Decoder for Efficient Demoir'eing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhibo Du</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+L">Long Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+Z">Zheng-Jun Zha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14912v1-abstract-short" style="display: inline;"> Moir茅 patterns are commonly seen when taking photos of screens. Camera devices usually have limited hardware performance but take high-resolution photos. However, users are sensitive to the photo processing time, which presents a hardly considered challenge of efficiency for demoir茅ing methods. To balance the network speed and quality of results, we propose a \textbf{F}ully \textbf{C}onnected en\t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14912v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14912v1-abstract-full" style="display: none;"> Moir茅 patterns are commonly seen when taking photos of screens. Camera devices usually have limited hardware performance but take high-resolution photos. However, users are sensitive to the photo processing time, which presents a hardly considered challenge of efficiency for demoir茅ing methods. To balance the network speed and quality of results, we propose a \textbf{F}ully \textbf{C}onnected en\textbf{C}oder-de\textbf{C}oder based \textbf{D}emoir茅ing \textbf{Net}work (FC3DNet). FC3DNet utilizes features with multiple scales in each stage of the decoder for comprehensive information, which contains long-range patterns as well as various local moir茅 styles that both are crucial aspects in demoir茅ing. Besides, to make full use of multiple features, we design a Multi-Feature Multi-Attention Fusion (MFMAF) module to weigh the importance of each feature and compress them for efficiency. These designs enable our network to achieve performance comparable to state-of-the-art (SOTA) methods in real-world datasets while utilizing only a fraction of parameters, FLOPs, and runtime. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14912v1-abstract-full').style.display = 'none'; document.getElementById('2406.14912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICIP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12793">arXiv:2406.12793</a> <span> [<a href="https://arxiv.org/pdf/2406.12793">pdf</a>, <a href="https://arxiv.org/format/2406.12793">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=GLM%2C+T">Team GLM</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+A">Aohan Zeng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bin Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bowen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenhui Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+D">Da Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dan Zhang</a>, <a href="/search/cs?searchtype=author&query=Rojas%2C+D">Diego Rojas</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+G">Guanyu Feng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanlin Zhao</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+H">Hanyu Lai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hao Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongning Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiadai Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiajie Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiale Cheng</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiayi Gui</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jie Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jingyu Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juanzi Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lindong Wu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+L">Lucen Zhong</a> , et al. (34 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12793v2-abstract-short" style="display: inline;"> We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. 
They represent our most capable models that are trained with all the insights and lessons gained from the preceding three generations of ChatGLM. To date, the GLM-4 models are pre-trained… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12793v2-abstract-full').style.display = 'inline'; document.getElementById('2406.12793v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12793v2-abstract-full" style="display: none;"> We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent our most capable models that are trained with all the insights and lessons gained from the preceding three generations of ChatGLM. To date, the GLM-4 models are pre-trained on ten trillion tokens mostly in Chinese and English, along with a small set of corpora from 24 languages, and aligned primarily for Chinese and English usage. The high-quality alignment is achieved via a multi-stage post-training process, which involves supervised fine-tuning and learning from human feedback. Evaluations show that GLM-4 1) closely rivals or outperforms GPT-4 in terms of general metrics such as MMLU, GSM8K, MATH, BBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following as measured by IFEval, 3) matches GPT-4 Turbo (128K) and Claude 3 for long context tasks, and 4) outperforms GPT-4 in Chinese alignments as measured by AlignBench. The GLM-4 All Tools model is further aligned to understand user intent and autonomously decide when and which tool(s) to use -- including web browser, Python interpreter, text-to-image model, and user-defined functions -- to effectively complete complex tasks. In practical applications, it matches and even surpasses GPT-4 All Tools in tasks like accessing online information via web browsing and solving math problems using the Python interpreter. Over the course of development, we have open-sourced a series of models, including ChatGLM-6B (three generations), GLM-4-9B (128K, 1M), GLM-4V-9B, WebGLM, and CodeGeeX, attracting over 10 million downloads on Hugging Face in the year 2023 alone. The open models can be accessed through https://github.com/THUDM and https://huggingface.co/THUDM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12793v2-abstract-full').style.display = 'none'; document.getElementById('2406.12793v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12746">arXiv:2406.12746</a> <span> [<a href="https://arxiv.org/pdf/2406.12746">pdf</a>, <a href="https://arxiv.org/format/2406.12746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Diversify, Rationalize, and Combine: Ensembling Multiple QA Strategies for Zero-shot Knowledge-based VQA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Miaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoxin Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zilin Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12746v5-abstract-short" style="display: inline;"> Knowledge-based Visual Question-answering (K-VQA) often requires the use of background knowledge beyond the image. However, we discover that a single knowledge generation strategy is often insufficient for all K-VQA questions. To this end, we propose Diversification, Evidence Truncation, and Combination for Knowledge-based Elucidation (DietCoke), which utilizes a bundle of complementary question-a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12746v5-abstract-full').style.display = 'inline'; document.getElementById('2406.12746v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12746v5-abstract-full" style="display: none;"> Knowledge-based Visual Question-answering (K-VQA) often requires the use of background knowledge beyond the image. However, we discover that a single knowledge generation strategy is often insufficient for all K-VQA questions. To this end, we propose Diversification, Evidence Truncation, and Combination for Knowledge-based Elucidation (DietCoke), which utilizes a bundle of complementary question-answering tactics and aggregates their answers using textual rationales. DietCoke comprises three stages: diversification, rationalization, and ensemble. The diversification stage generates three distinctive decision contexts, each leading to its own answer candidate. The rationalization stage generates two rationales, the automatic rationale and the mechanistic rationale, for each answer candidate using decorrelated techniques. Finally, in the ensemble stage, an LLM informed by the rationales selects one answer from the three candidates. Experiments show that DietCoke significantly outperforms state-of-the-art LLM-based baselines by 2.8% on OK-VQA and 4.7% on A-OKVQA, and that the strategies in the ensembles are highly complementary. 
Code is available at: https://github.com/limiaoyu/DietCoke <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12746v5-abstract-full').style.display = 'none'; document.getElementById('2406.12746v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Findings of EMNLP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09950">arXiv:2406.09950</a> <span> [<a href="https://arxiv.org/pdf/2406.09950">pdf</a>, <a href="https://arxiv.org/format/2406.09950">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> An efficient text augmentation approach for contextualized Mandarin speech recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+N">Naijun Zheng</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+X">Xucheng Wan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kai Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Ziqing Du</a>, <a href="/search/cs?searchtype=author&query=Huan%2C+Z">Zhou Huan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09950v1-abstract-short" style="display: inline;"> Although contextualized automatic speech recognition (ASR) systems are commonly used to improve the recognition of uncommon words, their effectiveness is hindered by the inherent limitations of speech-text data availability. To address this challenge, our study proposes to leverage extensive text-only datasets and contextualize pre-trained ASR models using a straightforward text-augmentation (TA)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09950v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09950v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09950v1-abstract-full" style="display: none;"> Although contextualized automatic speech recognition (ASR) systems are commonly used to improve the recognition of uncommon words, their effectiveness is hindered by the inherent limitations of speech-text data availability. To address this challenge, our study proposes to leverage extensive text-only datasets and contextualize pre-trained ASR models using a straightforward text-augmentation (TA) technique, all while keeping computational costs minimal. In particular, to contextualize a pre-trained CIF-based ASR, we construct a codebook using limited speech-text data. 
By utilizing a simple codebook lookup process, we convert available text-only data into latent text embeddings. These embeddings then enhance the inputs for the contextualized ASR. Our experiments on diverse Mandarin test sets demonstrate that our TA approach significantly boosts recognition performance. The top-performing system shows relative CER improvements of up to 30% on rare words and 15% across all words in general. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09950v1-abstract-full').style.display = 'none'; document.getElementById('2406.09950v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to interspeech2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08979">arXiv:2406.08979</a> <span> [<a href="https://arxiv.org/pdf/2406.08979">pdf</a>, <a href="https://arxiv.org/format/2406.08979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Multi-Agent Software Development through Cross-Team Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhuoyun Du</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C">Chen Qian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zihao Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifei Wang</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yufan Dang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weize Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08979v1-abstract-short" style="display: inline;"> The latest breakthroughs in Large Language Models (LLMs), eg., ChatDev, have catalyzed profound transformations, particularly through multi-agent collaboration for software development. 
LLM agents can collaborate in teams like humans, and follow the waterfall model to sequentially work on requirements analysis, development, review, testing, and other phases to perform autonomous software generatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08979v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08979v1-abstract-full" style="display: none;"> The latest breakthroughs in Large Language Models (LLMs), eg., ChatDev, have catalyzed profound transformations, particularly through multi-agent collaboration for software development. LLM agents can collaborate in teams like humans, and follow the waterfall model to sequentially work on requirements analysis, development, review, testing, and other phases to perform autonomous software generation. However, for an agent team, each phase in a single development process yields only one possible outcome. This results in the completion of only one development chain, thereby losing the opportunity to explore multiple potential decision paths within the solution space. Consequently, this may lead to obtaining suboptimal results. To address this challenge, we introduce Cross-Team Collaboration (CTC), a scalable multi-team framework that enables orchestrated teams to jointly propose various decisions and communicate with their insights in a cross-team collaboration environment for superior content generation. Experimental results in software development reveal a notable increase in quality compared to state-of-the-art baselines, underscoring the efficacy of our framework. The significant improvements in story generation demonstrate the promising generalization ability of our framework across various domains. We anticipate that our work will guide LLM agents towards a cross-team paradigm and contribute to their significant growth in but not limited to software development. The code and data will be available at https://github.com/OpenBMB/ChatDev. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08979v1-abstract-full').style.display = 'none'; document.getElementById('2406.08979v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07155">arXiv:2406.07155</a> <span> [<a href="https://arxiv.org/pdf/2406.07155">pdf</a>, <a href="https://arxiv.org/format/2406.07155">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Scaling Large-Language-Model-based Multi-Agent Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qian%2C+C">Chen Qian</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zihao Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifei Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yufan Dang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhuoyun Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weize Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07155v1-abstract-short" style="display: inline;"> Pioneering advancements in large language model-powered agents have underscored the design pattern of multi-agent collaboration, demonstrating that collective intelligence can surpass the capabilities of each individual. Inspired by the neural scaling law, which posits that increasing neurons leads to emergent abilities, this study investigates whether a similar principle applies to increasing age… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07155v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07155v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07155v1-abstract-full" style="display: none;"> Pioneering advancements in large language model-powered agents have underscored the design pattern of multi-agent collaboration, demonstrating that collective intelligence can surpass the capabilities of each individual. Inspired by the neural scaling law, which posits that increasing neurons leads to emergent abilities, this study investigates whether a similar principle applies to increasing agents in multi-agent collaboration. 
Technically, we propose multi-agent collaboration networks (MacNet), which utilize directed acyclic graphs to organize agents and streamline their interactive reasoning via topological ordering, with solutions derived from their dialogues. Extensive experiments show that MacNet consistently outperforms baseline models, enabling effective agent collaboration across various network topologies and supporting cooperation among more than a thousand agents. Notably, we observed a small-world collaboration phenomenon, where topologies resembling small-world properties achieved superior performance. Additionally, we identified a collaborative scaling law, indicating that normalized solution quality follows a logistic growth pattern as scaling agents, with collaborative emergence occurring much earlier than previously observed instances of neural emergence. The code and data will be available at https://github.com/OpenBMB/ChatDev. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07155v1-abstract-full').style.display = 'none'; document.getElementById('2406.07155v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress; The code and data will be available at https://github.com/OpenBMB/ChatDev</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03250">arXiv:2406.03250</a> <span> [<a href="https://arxiv.org/pdf/2406.03250">pdf</a>, <a href="https://arxiv.org/format/2406.03250">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Prompt-based Visual Alignment for Zero-shot Policy Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+H">Haihan Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+Q">Qi Yi</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">Hantao Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haochen Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiaming Guo</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Shaohui Peng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yunkai Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">QiCheng Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xing Hu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yuanbo Wen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zihao Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zidong Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Ling Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qi Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunji Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03250v1-abstract-short" style="display: inline;"> Overfitting in RL has become one of the main obstacles to applications in reinforcement learning(RL). Existing methods do not provide explicit semantic constrain for the feature extractor, hindering the agent from learning a unified cross-domain representation and resulting in performance degradation on unseen domains. Besides, abundant data from multiple domains are needed. To address these issue… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03250v1-abstract-full').style.display = 'inline'; document.getElementById('2406.03250v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03250v1-abstract-full" style="display: none;"> Overfitting in RL has become one of the main obstacles to applications in reinforcement learning(RL). Existing methods do not provide explicit semantic constrain for the feature extractor, hindering the agent from learning a unified cross-domain representation and resulting in performance degradation on unseen domains. Besides, abundant data from multiple domains are needed. To address these issues, in this work, we propose prompt-based visual alignment (PVA), a robust framework to mitigate the detrimental domain bias in the image for zero-shot policy transfer. Inspired that Visual-Language Model (VLM) can serve as a bridge to connect both text space and image space, we leverage the semantic information contained in a text sequence as an explicit constraint to train a visual aligner. Thus, the visual aligner can map images from multiple domains to a unified domain and achieve good generalization performance. To better depict semantic information, prompt tuning is applied to learn a sequence of learnable tokens. With explicit constraints of semantic information, PVA can learn unified cross-domain representation under limited access to cross-domain data and achieves great zero-shot generalization ability in unseen domains. We verify PVA on a vision-based autonomous driving task with CARLA simulator. Experiments show that the agent generalizes well on unseen domains under limited access to multi-domain data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03250v1-abstract-full').style.display = 'none'; document.getElementById('2406.03250v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by ICML2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19686">arXiv:2405.19686</a> <span> [<a href="https://arxiv.org/pdf/2405.19686">pdf</a>, <a href="https://arxiv.org/format/2405.19686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Knowledge Graph Tuning: Real-time Large Language Model Personalization based on Human Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jingwei Sun</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhixu Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiran Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19686v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable proficiency in a range of natural language processing tasks. Once deployed, LLMs encounter users with personalized factual knowledge, and such personalized knowledge is consistently reflected through users' interactions with the LLMs. To enhance user experience, real-time model personalization is essential, allowing LLMs to adapt user-speci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19686v1-abstract-full').style.display = 'inline'; document.getElementById('2405.19686v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19686v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable proficiency in a range of natural language processing tasks. Once deployed, LLMs encounter users with personalized factual knowledge, and such personalized knowledge is consistently reflected through users' interactions with the LLMs. To enhance user experience, real-time model personalization is essential, allowing LLMs to adapt user-specific knowledge based on user feedback during human-LLM interactions. Existing methods mostly require back-propagation to finetune the model parameters, which incurs high computational and memory costs. In addition, these methods suffer from low interpretability, which will cause unforeseen impacts on model performance during long-term use, where the user's personalized knowledge is accumulated extensively. To address these challenges, we propose Knowledge Graph Tuning (KGT), a novel approach that leverages knowledge graphs (KGs) to personalize LLMs. KGT extracts personalized factual knowledge triples from users' queries and feedback and optimizes KGs without modifying the LLM parameters. Our method improves computational and memory efficiency by avoiding back-propagation and ensures interpretability by making the KG adjustments comprehensible to humans. Experiments with state-of-the-art LLMs, including GPT-2, Llama2, and Llama3, show that KGT significantly improves personalization performance while reducing latency and GPU memory costs. 
Ultimately, KGT offers a promising solution of effective, efficient, and interpretable real-time LLM personalization during user interactions with the LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19686v1-abstract-full').style.display = 'none'; document.getElementById('2405.19686v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15414">arXiv:2405.15414</a> <span> [<a href="https://arxiv.org/pdf/2405.15414">pdf</a>, <a href="https://arxiv.org/format/2405.15414">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Luban: Building Open-Ended Creative Agents via Autonomous Embodied Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuxuan Guo</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Shaohui Peng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiaming Guo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xishan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yifan Hao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Ling Li</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Z">Zikang Tian</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Mingju Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yutai Li</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Y">Yiming Gan</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shuai Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zihao Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zidong Du</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qi Guo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xing Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunji Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15414v1-abstract-short" style="display: inline;"> Building open agents has always been the ultimate goal in AI research, and creative agents are the more enticing. Existing LLM agents excel at long-horizon tasks with well-defined goals (e.g., `mine diamonds' in Minecraft). 
However, they encounter difficulties on creative tasks with open goals and abstract criteria due to the inability to bridge the gap between them, thus lacking feedback for self… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15414v1-abstract-full').style.display = 'inline'; document.getElementById('2405.15414v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15414v1-abstract-full" style="display: none;"> Building open agents has always been the ultimate goal in AI research, and creative agents are the more enticing. Existing LLM agents excel at long-horizon tasks with well-defined goals (e.g., `mine diamonds' in Minecraft). However, they encounter difficulties on creative tasks with open goals and abstract criteria due to the inability to bridge the gap between them, thus lacking feedback for self-improvement in solving the task. In this work, we introduce autonomous embodied verification techniques for agents to fill the gap, laying the groundwork for creative tasks. Specifically, we propose the Luban agent, targeting creative building tasks in Minecraft, which is equipped with two-level autonomous embodied verification inspired by human design practices: (1) visual verification of 3D structural speculations, which come from agent-synthesized CAD modeling programs; (2) pragmatic verification of the creation by generating and verifying environment-relevant functionality programs based on the abstract criteria. Extensive multi-dimensional human studies and Elo ratings show that the Luban completes diverse creative building tasks in our proposed benchmark and outperforms other baselines ($33\%$ to $100\%$) in both visualization and pragmatism. Additional demos on a real-world robotic arm show the creation potential of the Luban in the physical world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15414v1-abstract-full').style.display = 'none'; document.getElementById('2405.15414v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
arXiv:2405.12892 (cs.IR, cs.LG) - https://arxiv.org/abs/2405.12892
Title: Retrievable Domain-Sensitive Feature Memory for Multi-Domain Recommendation
Authors: Yuang Zhao, Zhaocheng Du, Qinglin Jia, Linxuan Zhang, Zhenhua Dong, Ruiming Tang
Abstract: With the increase in the business scale and number of domains in online advertising, multi-domain ad recommendation has become a mainstream solution in the industry. The core of multi-domain recommendation is effectively modeling the commonalities and distinctions among domains. Existing works are dedicated to designing model architectures for implicit multi-domain modeling while overlooking an in-depth investigation from a more fundamental perspective of feature distributions. This paper focuses on features with significant differences across various domains in both distributions and effects on model predictions. We refer to these features as domain-sensitive features, which serve as carriers of domain distinctions and are crucial for multi-domain modeling. Experiments demonstrate that existing multi-domain modeling methods may neglect domain-sensitive features, indicating insufficient learning of domain distinctions. To avoid this neglect, we propose a domain-sensitive feature attribution method to identify features that best reflect domain distinctions from the feature set. Further, we design a memory architecture that extracts domain-specific information from domain-sensitive features for the model to retrieve and integrate, thereby enhancing the awareness of domain distinctions. Extensive offline and online experiments demonstrate the superiority of our method in capturing domain distinctions and improving multi-domain recommendation performance.
Submitted 21 May, 2024; originally announced May 2024.
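The abstract does not spell out the attribution method itself, but the notion of a "domain-sensitive" feature can be illustrated with a small, self-contained sketch: rank categorical features by how much their value distributions diverge across domains. The Jensen-Shannon divergence, the max-over-domain-pairs aggregation, and the toy data below are illustrative assumptions, not the paper's algorithm.

from collections import Counter
from math import log2

def value_distribution(values):
    """Empirical distribution of a categorical feature column."""
    total = len(values)
    return {v: c / total for v, c in Counter(values).items()}

def js_divergence(p, q):
    """Jensen-Shannon divergence (base 2) between two discrete distributions."""
    m = {k: 0.5 * (p.get(k, 0.0) + q.get(k, 0.0)) for k in set(p) | set(q)}
    def kl(a):
        return sum(prob * log2(prob / m[k]) for k, prob in a.items() if prob > 0)
    return 0.5 * kl(p) + 0.5 * kl(q)

def rank_domain_sensitive(rows_by_domain, feature_names):
    """Rank features by their largest pairwise distribution gap across domains."""
    domains = list(rows_by_domain)
    scores = {}
    for f in feature_names:
        dists = {d: value_distribution([row[f] for row in rows_by_domain[d]]) for d in domains}
        scores[f] = max(js_divergence(dists[a], dists[b])
                        for i, a in enumerate(domains) for b in domains[i + 1:])
    return sorted(scores.items(), key=lambda kv: -kv[1])

# Toy example with two domains and one categorical feature.
rows_by_domain = {
    "news_feed": [{"device": "ios"}, {"device": "ios"}, {"device": "android"}],
    "search_ads": [{"device": "android"}, {"device": "android"}, {"device": "web"}],
}
print(rank_domain_sensitive(rows_by_domain, ["device"]))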
arXiv:2405.11413 (eess.AS, cs.LG) - https://arxiv.org/abs/2405.11413
Title: Exploring speech style spaces with language models: Emotional TTS without emotion labels
Authors: Shreeram Suresh Chandra, Zongyang Du, Berrak Sisman
Abstract: Many frameworks for emotional text-to-speech (E-TTS) rely on human-annotated emotion labels that are often inaccurate and difficult to obtain. Learning emotional prosody implicitly presents a tough challenge due to the subjective nature of emotions. In this study, we propose a novel approach that leverages text awareness to acquire emotional styles without the need for explicit emotion labels or text prompts. We present TEMOTTS, a two-stage framework for E-TTS that is trained without emotion labels and is capable of inference without auxiliary inputs. Our proposed method performs knowledge transfer between the linguistic space learned by BERT and the emotional style space constructed by global style tokens. Our experimental results demonstrate the effectiveness of our proposed framework, showcasing improvements in emotional accuracy and naturalness. This is one of the first studies to leverage the emotional correlation between spoken content and expressive delivery for emotional TTS.
Submitted 18 May, 2024; originally announced May 2024.
Comments: Accepted at Speaker Odyssey 2024

arXiv:2405.04514 (quant-ph, cs.DC) - https://arxiv.org/abs/2405.04514
Title: Scalable Circuit Cutting and Scheduling in a Resource-constrained and Distributed Quantum System
Authors: Shuwen Kan, Zefan Du, Miguel Palma, Samuel A Stein, Chenxu Liu, Wenqi Wei, Juntao Chen, Ang Li, Ying Mao
Abstract: Despite quantum computing's rapid development, current systems remain limited in practical applications due to their limited qubit count and quality. Various technologies, such as superconducting, trapped-ion, and neutral-atom quantum computing, are progressing towards a fault-tolerant era; however, they all face a diverse set of challenges in scalability and control. Recent efforts have focused on multi-node quantum systems that connect multiple smaller quantum devices to execute larger circuits. Future demonstrations hope to use quantum channels to couple systems; for now, demonstrations can leverage classical communication with circuit cutting techniques. This involves cutting large circuits into smaller subcircuits and reconstructing them post-execution. However, existing cutting methods are hindered by lengthy search times as the number of qubits and gates increases. Additionally, they often fail to effectively utilize the resources of various worker configurations in a multi-node system. To address these challenges, we introduce FitCut, a novel approach that transforms quantum circuits into weighted graphs and utilizes a community-based, bottom-up approach to cut circuits according to resource constraints, e.g., qubit counts, on each worker. FitCut also includes a scheduling algorithm that optimizes resource utilization across workers. Implemented with Qiskit and evaluated extensively, FitCut significantly outperforms the Qiskit Circuit Knitting Toolbox, reducing time costs by factors ranging from 3 to 2000 and improving resource utilization rates by up to 3.88 times on the worker side, achieving a system-wide improvement of 2.86 times.
Submitted 7 May, 2024; originally announced May 2024.
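FitCut's own community-based cutter is not reproduced here, but the representation it describes, a quantum circuit viewed as a weighted qubit graph that is partitioned under per-worker qubit capacities, can be illustrated with a short sketch. The greedy grouping heuristic, the toy gate list, and the capacity value below are illustrative assumptions rather than the paper's algorithm.

from collections import defaultdict

def build_qubit_graph(two_qubit_gates):
    """Weight each qubit pair by how many two-qubit gates act on it."""
    weights = defaultdict(int)
    for q1, q2 in two_qubit_gates:
        weights[frozenset((q1, q2))] += 1
    return dict(weights)

def greedy_partition(num_qubits, weights, capacity):
    """Greedily keep heavily interacting qubits in the same group (size <= capacity).

    Any edge whose endpoints end up in different groups corresponds to gates
    that would have to be cut, so heavy edges are handled first.
    """
    group_of, groups = {}, []
    for pair, _ in sorted(weights.items(), key=lambda kv: -kv[1]):
        a, b = tuple(pair)
        ga, gb = group_of.get(a), group_of.get(b)
        if ga is None and gb is None and capacity >= 2:
            groups.append({a, b})
            group_of[a] = group_of[b] = len(groups) - 1
        elif ga is not None and gb is None and len(groups[ga]) < capacity:
            groups[ga].add(b); group_of[b] = ga
        elif gb is not None and ga is None and len(groups[gb]) < capacity:
            groups[gb].add(a); group_of[a] = gb
        elif ga is not None and gb is not None and ga != gb \
                and len(groups[ga]) + len(groups[gb]) <= capacity:
            for q in groups[gb]:
                group_of[q] = ga
            groups[ga] |= groups[gb]
            groups[gb] = set()
    for q in range(num_qubits):                 # isolated qubits get their own group
        if q not in group_of:
            groups.append({q}); group_of[q] = len(groups) - 1
    cut_weight = sum(w for pair, w in weights.items()
                     if len({group_of[q] for q in pair}) > 1)
    return [g for g in groups if g], cut_weight

# Toy 6-qubit circuit described only by its two-qubit gate endpoints.
gates = [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (2, 3), (0, 1)]
groups, cut = greedy_partition(6, build_qubit_graph(gates), capacity=3)
print(groups, cut)   # e.g. [{0, 1, 2}, {3, 4, 5}] with a cut weight of 1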
arXiv:2405.04490 (cs.DC, quant-ph) - https://arxiv.org/abs/2405.04490
Title: Resource-Efficient and Self-Adaptive Quantum Search in a Quantum-Classical Hybrid System
Authors: Zihao Jiang, Zefan Du, Shaolun Ruan, Juntao Chen, Yong Wang, Long Cheng, Rajkumar Buyya, Ying Mao
Abstract: Over the past decade, the rapid advancement of deep learning and big data applications has been driven by vast datasets and high-performance computing systems. However, as we approach the physical limits of semiconductor fabrication in the post-Moore's Law era, questions arise about the future of these applications. In parallel, quantum computing has made significant progress, with the potential to break through these limits. Major companies like IBM, Google, and Microsoft provide access to noisy intermediate-scale quantum (NISQ) computers. Despite the theoretical promise of Shor's and Grover's algorithms, practical implementation on current quantum devices faces challenges, such as demanding additional resources and a high number of controlled operations. To tackle these challenges and optimize the utilization of limited onboard qubits, we introduce ReSaQuS, a resource-efficient index-value searching system within a quantum-classical hybrid framework. Building on Grover's algorithm, ReSaQuS employs an automatically managed iterative search approach. This method analyzes the problem size, filters out less probable data points, and progressively reduces the dataset with decreasing qubit requirements. Implemented using Qiskit and evaluated through extensive experiments, ReSaQuS has demonstrated a substantial reduction of up to 86.36% in cumulative qubit consumption and 72.72% in active periods, reinforcing its potential in optimizing quantum computing application deployment.
Submitted 7 May, 2024; originally announced May 2024.
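ReSaQuS itself builds on Grover's algorithm in Qiskit; the sketch below only illustrates the resource-accounting idea stated in the abstract, namely that each search round indexes a shrinking candidate set, so the qubit requirement ceil(log2 N) and the cumulative qubit consumption fall as unlikely candidates are filtered out. The filtering fraction and the example size are illustrative assumptions.

import math

def qubit_schedule(n_items, keep_fraction=0.5, min_items=2):
    """Per-round index-register sizes for an iterative search that filters
    the candidate set between rounds (the fractions here are assumptions)."""
    schedule, n = [], n_items
    while n >= min_items:
        schedule.append((n, math.ceil(math.log2(n))))
        n = int(n * keep_fraction)
    return schedule

sched = qubit_schedule(1024, keep_fraction=0.25)
print(sched)                      # [(1024, 10), (256, 8), (64, 6), (16, 4), (4, 2)]
print(sum(q for _, q in sched))   # 30 cumulative qubit-rounds vs. 5 x 10 = 50 without filtering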
arXiv:2405.01730 (eess.AS, cs.SD) - https://arxiv.org/abs/2405.01730
Title: Converting Anyone's Voice: End-to-End Expressive Voice Conversion with a Conditional Diffusion Model
Authors: Zongyang Du, Junchen Lu, Kun Zhou, Lakshmish Kaushik, Berrak Sisman
Abstract: Expressive voice conversion (VC) conducts speaker identity conversion for emotional speakers by jointly converting speaker identity and emotional style. Emotional style modeling for arbitrary speakers in expressive VC has not been extensively explored. Previous approaches have relied on vocoders for speech reconstruction, which makes speech quality heavily dependent on the performance of the vocoder. A major challenge of expressive VC lies in emotion prosody modeling. To address these challenges, this paper proposes a fully end-to-end expressive VC framework based on a conditional denoising diffusion probabilistic model (DDPM). We utilize speech units derived from self-supervised speech models as content conditioning, along with deep features extracted from speech emotion recognition and speaker verification systems to model emotional style and speaker identity. Objective and subjective evaluations show the effectiveness of our framework. Code and samples are publicly available.
Submitted 2 May, 2024; originally announced May 2024.
Comments: Accepted by Speaker Odyssey 2024
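For readers unfamiliar with the conditional DDPM formulation the abstract refers to, the standard forward (noising) process and its training target fit in a few lines. The array shapes, the linear noise schedule, and the way the conditioning vector is assembled here (concatenating content, emotion, and speaker embeddings) are assumptions made for illustration, not the paper's exact architecture.

import numpy as np

rng = np.random.default_rng(0)
T = 1000
betas = np.linspace(1e-4, 0.02, T)        # common linear noise schedule
alpha_bar = np.cumprod(1.0 - betas)

def forward_diffuse(x0, t):
    """Sample x_t ~ q(x_t | x_0) = N(sqrt(a_bar_t) * x_0, (1 - a_bar_t) * I)."""
    eps = rng.standard_normal(x0.shape)
    return np.sqrt(alpha_bar[t]) * x0 + np.sqrt(1.0 - alpha_bar[t]) * eps, eps

# Illustrative shapes: an acoustic feature map plus a conditioning vector built
# from content (speech-unit), emotion, and speaker embeddings.
x0 = rng.standard_normal((80, 128))
cond = np.concatenate([rng.standard_normal(256),   # speech-unit / content features
                       rng.standard_normal(128),   # emotion embedding
                       rng.standard_normal(192)])  # speaker embedding
t = rng.integers(T)
x_t, eps = forward_diffuse(x0, t)
# A conditional denoiser eps_theta(x_t, t, cond) is trained to predict eps,
# e.g. with loss = mean((eps_theta(x_t, t, cond) - eps) ** 2).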
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Speaker Odyssey 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00393">arXiv:2405.00393</a> <span> [<a href="https://arxiv.org/pdf/2405.00393">pdf</a>, <a href="https://arxiv.org/format/2405.00393">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Inferring State Machine from the Protocol Implementation via Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+H">Haiyang Wei</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhengjie Du</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haohui Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Guang Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Linzhang Wang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+B">Bing Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00393v2-abstract-short" style="display: inline;"> State machines play a pivotal role in augmenting the efficacy of protocol analyzing to unveil more vulnerabilities. However, the task of inferring state machines from network protocol implementations presents significant challenges. Traditional methods based on dynamic analysis often overlook crucial state transitions due to limited coverage, while static analysis faces difficulties with complex c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00393v2-abstract-full').style.display = 'inline'; document.getElementById('2405.00393v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00393v2-abstract-full" style="display: none;"> State machines play a pivotal role in augmenting the efficacy of protocol analyzing to unveil more vulnerabilities. However, the task of inferring state machines from network protocol implementations presents significant challenges. Traditional methods based on dynamic analysis often overlook crucial state transitions due to limited coverage, while static analysis faces difficulties with complex code structures and behaviors. To address these limitations, we propose an innovative state machine inference approach powered by Large Language Models (LLMs). Utilizing text-embedding technology, this method allows LLMs to dissect and analyze the intricacies of protocol implementation code. Through targeted prompt engineering, we systematically identify and infer the underlying state machines. Our evaluation across six protocol implementations demonstrates the method's high efficacy, achieving an accuracy rate exceeding 90% and successfully delineating differences on state machines among various implementations of the same protocol. 
Submitted 14 June, 2024; v1 submitted 1 May, 2024; originally announced May 2024.
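The general pattern of prompting an LLM over implementation code and merging the per-chunk answers into a transition table can be sketched briefly. The prompt wording, the ask_llm placeholder, and the JSON output format below are hypothetical stand-ins, not the paper's prompts or tooling.

import json
import re

def ask_llm(prompt: str) -> str:
    """Placeholder for a chat-completion call; assumed to return a JSON array of
    objects with "from", "event", and "to" keys describing state transitions."""
    raise NotImplementedError  # wire up any LLM client here

PROMPT_TEMPLATE = (
    "You are analyzing a network protocol implementation.\n"
    "List every state transition performed by the following function as a JSON array\n"
    'of objects with keys "from", "event", and "to".\n\n{code}\n'
)

def infer_state_machine(code_chunks):
    """Merge per-chunk transition lists into one transition table."""
    transitions = {}
    for chunk in code_chunks:
        reply = ask_llm(PROMPT_TEMPLATE.format(code=chunk))
        match = re.search(r"\[.*\]", reply, re.S)      # be tolerant of extra prose
        if not match:
            continue
        for t in json.loads(match.group(0)):
            transitions.setdefault((t["from"], t["event"]), set()).add(t["to"])
    return transitions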
arXiv:2404.16484 (cs.CV, eess.IV) - https://arxiv.org/abs/2404.16484
Title: Real-Time 4K Super-Resolution of Compressed AVIF Images. AIS 2024 Challenge Survey
Authors: Marcos V. Conde, Zhijun Lei, Wen Li, Cosmin Stejerean, Ioannis Katsavounidis, Radu Timofte, Kihwan Yoon, Ganzorig Gankhuyag, Jiangtao Lv, Long Sun, Jinshan Pan, Jiangxin Dong, Jinhui Tang, Zhiyuan Li, Hao Wei, Chenyang Ge, Dongyang Zhang, Tianle Liu, Huaian Chen, Yi Jin, Menghan Zhou, Yiqiang Yan, Si Gao, Biao Wu, Shaoli Liu, et al. (50 additional authors not shown)
Abstract: This paper introduces a novel benchmark as part of the AIS 2024 Real-Time Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed images from 540p to 4K resolution (4x factor) in real time on commercial GPUs. For this, we use a diverse test set containing a variety of 4K images ranging from digital art to gaming and photography. The images are compressed using the modern AVIF codec instead of JPEG. All the proposed methods improve PSNR fidelity over Lanczos interpolation and process images in under 10 ms. Out of the 160 participants, 25 teams submitted their code and models. The solutions present novel designs tailored for memory efficiency and runtime on edge devices. This survey describes the best solutions for real-time SR of compressed high-resolution images.
Submitted 25 April, 2024; originally announced April 2024.
Comments: CVPR 2024, AI for Streaming (AIS) Workshop
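Since Lanczos interpolation is the fidelity baseline that challenge entries are compared against, a minimal reference upscaler is easy to write down. The 4x factor matches the challenge setup; the file names are placeholders, and Pillow is assumed to be available (reading AVIF additionally requires an AVIF plugin, so a PNG frame works for a quick test).

from PIL import Image

def lanczos_upscale_4x(src_path: str, dst_path: str) -> None:
    """Baseline 4x upscale with Lanczos resampling (e.g. 960x540 -> 3840x2160)."""
    img = Image.open(src_path).convert("RGB")
    w, h = img.size
    img.resize((4 * w, 4 * h), resample=Image.LANCZOS).save(dst_path)

# Placeholder file names; any 540p frame works for a quick comparison.
# lanczos_upscale_4x("frame_540p.png", "frame_4k_lanczos.png")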
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024, AI for Streaming (AIS) Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10343">arXiv:2404.10343</a> <span> [<a href="https://arxiv.org/pdf/2404.10343">pdf</a>, <a href="https://arxiv.org/format/2404.10343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+B">Bin Ren</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yawei Li</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+N">Nancy Mehta</a>, <a href="/search/cs?searchtype=author&query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hongyuan Yu</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Cheng Wan</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yuxin Hong</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bingnan Han</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhuoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yajun Zou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuqing Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jizhe Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+K">Keji He</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+C">Chao Fan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaolin Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xuanwu Yin</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+K">Kunlong Zuo</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+B">Bohao Liao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+P">Peizhe Xia</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+L">Long Peng</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhibo Du</a>, <a href="/search/cs?searchtype=author&query=Di%2C+X">Xin Di</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wangkai Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a> , et al. (109 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10343v2-abstract-short" style="display: inline;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. 
Submitted 25 June, 2024; v1 submitted 16 April, 2024; originally announced April 2024.
Comments: The report paper of NTIRE 2024 Efficient Super-Resolution, accepted by CVPRW 2024
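The abstract describes the main-track ranking as a weighted sum of sub-track scores measured against the RLFN baseline, without giving the exact formula. The sketch below is therefore only a plausible shape of such a score: the weights, the baseline numbers, and the ratio-based normalization are all assumptions, not the official challenge definition.

# Hypothetical main-track score: each sub-track contributes the ratio of the
# RLFN baseline to the submission (lower runtime/FLOPs/params is better),
# combined as a weighted sum. Baseline values and weights are illustrative.
RLFN_BASELINE = {"runtime_ms": 20.0, "flops_g": 20.0, "params_m": 0.5}
WEIGHTS = {"runtime_ms": 0.5, "flops_g": 0.25, "params_m": 0.25}

def main_track_score(entry):
    return sum(WEIGHTS[k] * RLFN_BASELINE[k] / entry[k] for k in WEIGHTS)

submission = {"runtime_ms": 10.0, "flops_g": 16.0, "params_m": 0.4}
print(round(main_track_score(submission), 3))   # 1.625: better than baseline in every sub-track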
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The report paper of NTIRE2024 Efficient Super-resolution, accepted by CVPRW2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08687">arXiv:2404.08687</a> <span> [<a href="https://arxiv.org/pdf/2404.08687">pdf</a>, <a href="https://arxiv.org/format/2404.08687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Reasoning for Substitution Relationships: Definitions, Methods, and Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+A">Anxin Yang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhijuan Du</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+T">Tao Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08687v1-abstract-short" style="display: inline;"> Substitute relationships are fundamental to people's daily lives across various domains. This study aims to comprehend and predict substitute relationships among products in diverse fields, extensively analyzing the application of machine learning algorithms, natural language processing, and other technologies. By comparing model methodologies across different domains, such as defining substitutes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08687v1-abstract-full').style.display = 'inline'; document.getElementById('2404.08687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08687v1-abstract-full" style="display: none;"> Substitute relationships are fundamental to people's daily lives across various domains. This study aims to comprehend and predict substitute relationships among products in diverse fields, extensively analyzing the application of machine learning algorithms, natural language processing, and other technologies. By comparing model methodologies across different domains, such as defining substitutes, representing and learning substitute relationships, and substitute reasoning, this study offers a methodological foundation for delving deeper into substitute relationships. Through ongoing research and innovation, we can further refine the personalization and accuracy of substitute recommendation systems, thus advancing the development and application of this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08687v1-abstract-full').style.display = 'none'; document.getElementById('2404.08687v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
arXiv:2404.06033 (cs.CV) - https://arxiv.org/abs/2404.06033 - DOI: https://doi.org/10.1145/3581783.3612561
Title: Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for Multi-exposure Image Fusion
Authors: Pan Mu, Zhiying Du, Jinyuan Liu, Cong Bai
Abstract: In recent years, deep learning networks have made remarkable strides in the domain of multi-exposure image fusion. Nonetheless, prevailing approaches often involve directly feeding over-exposed and under-exposed images into the network, which leads to the under-utilization of inherent information present in the source images. Additionally, unsupervised techniques predominantly employ rudimentary weighted summation for color channel processing, culminating in an overall desaturated final image tone. To partially mitigate these issues, this study proposes a gamma correction module specifically designed to fully leverage latent information embedded within the source images. Furthermore, a modified transformer block embracing self-attention mechanisms is introduced to optimize the fusion process. Ultimately, a novel color enhancement algorithm is presented to augment image saturation while preserving intricate details. The source code is available at https://github.com/ZhiyingDu/BHFMEF.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06033v2-abstract-full').style.display = 'none'; document.getElementById('2404.06033v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 31st ACM International Conference on Multimedia, October 2023, Pages 2985-2993 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.02893">arXiv:2404.02893</a> <span> [<a href="https://arxiv.org/pdf/2404.02893">pdf</a>, <a href="https://arxiv.org/format/2404.02893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ChatGLM-Math: Improving Math Problem-Solving in Large Language Models with a Self-Critique Pipeline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yifan Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinghan Liu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Z">Zhenyu Hou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yueyan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaohan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+A">Aohan Zeng</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhengxiao Du</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wenyi Zhao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jie Tang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yuxiao Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.02893v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown excellent mastering of human language, but still struggle in real-world applications that require mathematical problem-solving. While many strategies and datasets to enhance LLMs' mathematics are developed, it remains a challenge to simultaneously maintain and improve both language and mathematical capabilities in deployed LLM systems.In this work, we tailor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02893v1-abstract-full').style.display = 'inline'; document.getElementById('2404.02893v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.02893v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown excellent mastering of human language, but still struggle in real-world applications that require mathematical problem-solving. 
arXiv:2404.02893 (cs.CL) - https://arxiv.org/abs/2404.02893
Title: ChatGLM-Math: Improving Math Problem-Solving in Large Language Models with a Self-Critique Pipeline
Authors: Yifan Xu, Xiao Liu, Xinghan Liu, Zhenyu Hou, Yueyan Li, Xiaohan Zhang, Zihan Wang, Aohan Zeng, Zhengxiao Du, Wenyi Zhao, Jie Tang, Yuxiao Dong
Abstract: Large language models (LLMs) have shown excellent mastery of human language but still struggle in real-world applications that require mathematical problem-solving. While many strategies and datasets for enhancing LLMs' mathematics have been developed, it remains a challenge to simultaneously maintain and improve both language and mathematical capabilities in deployed LLM systems. In this work, we tailor the Self-Critique pipeline, which addresses the challenge in the feedback-learning stage of LLM alignment. We first train a general Math-Critique model from the LLM itself to provide feedback signals. Then, we sequentially employ rejective fine-tuning and direct preference optimization over the LLM's own generations for data collection. Based on ChatGLM3-32B, we conduct a series of experiments on both academic datasets and our newly created challenging dataset, MathUserEval. Results show that our pipeline significantly enhances the LLM's mathematical problem-solving while still improving its language ability, outperforming LLMs that could be two times larger. Related techniques have been deployed to ChatGLM (https://chatglm.cn), an online serving LLM. The related evaluation dataset and scripts are released at https://github.com/THUDM/ChatGLM-Math.
Submitted 3 April, 2024; originally announced April 2024.
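The Self-Critique pipeline is described as rejective fine-tuning followed by direct preference optimization over the model's own generations, scored by a Math-Critique model. A generic sketch of that data-construction pattern follows; critique_score, the sampling function, the threshold, and the margin are hypothetical placeholders rather than the paper's actual components.

def critique_score(question: str, answer: str) -> float:
    """Placeholder for a Math-Critique-style scorer (a reward model in the paper)."""
    raise NotImplementedError

def build_alignment_data(questions, sample_fn, k=8, keep_threshold=0.8, margin=0.2):
    """Rejective fine-tuning + DPO pair construction from the model's own samples."""
    sft_data, dpo_pairs = [], []
    for q in questions:
        candidates = [sample_fn(q) for _ in range(k)]
        scored = sorted(((critique_score(q, a), a) for a in candidates), reverse=True)
        best_score, best = scored[0]
        worst_score, worst = scored[-1]
        if best_score >= keep_threshold:                  # keep only well-rated answers
            sft_data.append({"prompt": q, "response": best})
        if best_score - worst_score > margin:             # clearly separated pair for DPO
            dpo_pairs.append({"prompt": q, "chosen": best, "rejected": worst})
    return sft_data, dpo_pairs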
arXiv:2404.00934 (cs.CL) - https://arxiv.org/abs/2404.00934
Title: ChatGLM-RLHF: Practices of Aligning Large Language Models with Human Feedback
Authors: Zhenyu Hou, Yilin Niu, Zhengxiao Du, Xiaohan Zhang, Xiao Liu, Aohan Zeng, Qinkai Zheng, Minlie Huang, Hongning Wang, Jie Tang, Yuxiao Dong
Abstract: ChatGLM is a free-to-use AI service powered by the ChatGLM family of large language models (LLMs). In this paper, we present the ChatGLM-RLHF pipeline, a reinforcement learning from human feedback (RLHF) system designed to enhance ChatGLM's alignment with human preferences. ChatGLM-RLHF encompasses three major components: the collection of human preference data, the training of the reward model, and the optimization of policies. Throughout the process of integrating ChatGLM-RLHF into production, we encountered and addressed several unprecedented challenges. We introduce the strategies to mitigate reward variance for stabilized large-scale training, implement model parallelism with fused gradient descent, and design regularization constraints to avoid catastrophic forgetting in LLMs. Experiments show that ChatGLM-RLHF brings significant improvements in alignment tasks compared to the supervised fine-tuned (SFT) version of ChatGLM. For instance, it achieves on average 15% more wins against ChatGLM-SFT in Chinese alignment tasks. This work presents our practices of aligning LLMs with human preferences, offering insights into the challenges and solutions in RLHF implementations.
Submitted 3 April, 2024; v1 submitted 1 April, 2024; originally announced April 2024.
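The reward-model training step in an RLHF pipeline of this kind is typically a pairwise ranking objective over human preference pairs; the sketch below shows that generic Bradley-Terry loss with a tiny linear head standing in for the real scoring model, and is not ChatGLM-RLHF's actual implementation.

import torch
import torch.nn.functional as F

def reward_model_loss(r_chosen, r_rejected):
    """Pairwise Bradley-Terry objective: push r(chosen) above r(rejected)."""
    return -F.logsigmoid(r_chosen - r_rejected).mean()

# A tiny linear head stands in for the real reward model's scoring layer.
head = torch.nn.Linear(16, 1)
hidden_chosen = torch.randn(4, 16)      # pooled features of 4 preferred responses
hidden_rejected = torch.randn(4, 16)    # pooled features of the 4 rejected ones
loss = reward_model_loss(head(hidden_chosen).squeeze(-1),
                         head(hidden_rejected).squeeze(-1))
loss.backward()                         # gradients flow into the head's parameters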
arXiv:2404.00621 (cs.IR, cs.MM) - https://arxiv.org/abs/2404.00621
Title: Multimodal Pretraining, Adaptation, and Generation for Recommendation: A Survey
Authors: Qijiong Liu, Jieming Zhu, Yanting Yang, Quanyu Dai, Zhaocheng Du, Xiao-Ming Wu, Zhou Zhao, Rui Zhang, Zhenhua Dong
Abstract: Personalized recommendation serves as a ubiquitous channel for users to discover information tailored to their interests. However, traditional recommendation models primarily rely on unique IDs and categorical features for user-item matching, potentially overlooking the nuanced essence of raw item contents across multiple modalities such as text, image, audio, and video. This underutilization of multimodal data poses a limitation to recommender systems, especially in multimedia services like news, music, and short-video platforms. The recent advancements in large multimodal models offer new opportunities and challenges in developing content-aware recommender systems. This survey seeks to provide a comprehensive exploration of the latest advancements and future trajectories in multimodal pretraining, adaptation, and generation techniques, as well as their applications in enhancing recommender systems. Furthermore, we discuss current open challenges and opportunities for future research in this dynamic domain. We believe that this survey, alongside the curated resources, will provide valuable insights to inspire further advancements in this evolving landscape.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00621v2-abstract-full').style.display = 'none'; document.getElementById('2404.00621v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by KDD 2024. See our tutorial materials at https://mmrec.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16135">arXiv:2403.16135</a> <span> [<a href="https://arxiv.org/pdf/2403.16135">pdf</a>, <a href="https://arxiv.org/format/2403.16135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Complementary Recommendation in E-commerce: Definition, Approaches, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Linyue Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhijuan Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16135v1-abstract-short" style="display: inline;"> In recent years, complementary recommendation has received extensive attention in the e-commerce domain. In this paper, we comprehensively summarize and compare 34 representative studies conducted between 2009 and 2024. Firstly, we compare the data and methods used for modeling complementary relationships between products, including simple complementarity and more complex scenarios such as asymmet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16135v1-abstract-full').style.display = 'inline'; document.getElementById('2403.16135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16135v1-abstract-full" style="display: none;"> In recent years, complementary recommendation has received extensive attention in the e-commerce domain. In this paper, we comprehensively summarize and compare 34 representative studies conducted between 2009 and 2024. Firstly, we compare the data and methods used for modeling complementary relationships between products, including simple complementarity and more complex scenarios such as asymmetric complementarity, the coexistence of substitution and complementarity relationships between products, and varying degrees of complementarity between different pairs of products. Next, we classify and compare the models based on the research problems of complementary recommendation, such as diversity, personalization, and cold-start. 
arXiv:2403.16135 (https://arxiv.org/abs/2403.16135) [pdf: https://arxiv.org/pdf/2403.16135 | other: https://arxiv.org/format/2403.16135]
Subjects: Information Retrieval (cs.IR); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)
Title: Complementary Recommendation in E-commerce: Definition, Approaches, and Future Directions
Authors: Linyue Li, Zhijuan Du
Abstract: In recent years, complementary recommendation has received extensive attention in the e-commerce domain. In this paper, we comprehensively summarize and compare 34 representative studies conducted between 2009 and 2024. Firstly, we compare the data and methods used for modeling complementary relationships between products, including simple complementarity and more complex scenarios such as asymmetric complementarity, the coexistence of substitution and complementarity relationships between products, and varying degrees of complementarity between different pairs of products. Next, we classify and compare the models based on the research problems of complementary recommendation, such as diversity, personalization, and cold-start. Furthermore, we provide a comparative analysis of experimental results from different studies conducted on the same dataset, which helps identify the strengths and weaknesses of the research. Compared to previous surveys, this paper provides a more updated and comprehensive summary of the research, discusses future research directions, and contributes to the advancement of this field.
Submitted: 24 March 2024; originally announced March 2024.
Comments: 20 pages, 9 figures
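The asymmetric complementarity mentioned in this abstract (item B complements item A more strongly than A complements B, as with a phone and a phone case) can be made concrete with a small, hypothetical sketch. The two-role embedding scheme and every name below are illustrative assumptions, not a method taken from the surveyed papers:

```python
# Illustrative sketch only: give each product a "query" embedding (the anchor
# being shopped) and a "complement" embedding (a candidate add-on), so that
# score(a -> b) need not equal score(b -> a).
import numpy as np

rng = np.random.default_rng(0)
num_products, dim = 1000, 32

query_emb = rng.normal(size=(num_products, dim))       # product as the anchor
complement_emb = rng.normal(size=(num_products, dim))  # product as the add-on

def complement_score(a: int, b: int) -> float:
    """Directional score for 'b complements a'."""
    return float(query_emb[a] @ complement_emb[b])

a, b = 1, 2
print(complement_score(a, b), complement_score(b, a))  # generally different values
```

A single shared embedding with a symmetric dot product would force score(a, b) = score(b, a); splitting the two roles is one simple way the directionality, and the distinction from symmetric substitution relationships, can be encoded.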