Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 139 results for author: <span class="mathjax">Shi, T</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shi%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Shi, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shi%2C+T&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shi, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shi%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shi%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+T&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14405">arXiv:2411.14405</a> <span> [<a href="https://arxiv.org/pdf/2411.14405">pdf</a>, <a href="https://arxiv.org/format/2411.14405">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Huifeng Yin</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+B">Bo Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianqi Shi</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+C">Chenyang Lyu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longyue Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weihua Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14405v2-abstract-short" style="display: inline;"> Currently OpenAI o1 sparks a surge of interest in the study of large reasoning models (LRM). Building on this momentum, Marco-o1 not only focuses on disciplines with standard answers, such as mathematics, physics, and coding -- which are well-suited for reinforcement learning (RL) -- but also places greater emphasis on open-ended resolutions. We aim to address the question: ''Can the o1 model effe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14405v2-abstract-full').style.display = 'inline'; document.getElementById('2411.14405v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14405v2-abstract-full" style="display: none;"> Currently OpenAI o1 sparks a surge of interest in the study of large reasoning models (LRM). Building on this momentum, Marco-o1 not only focuses on disciplines with standard answers, such as mathematics, physics, and coding -- which are well-suited for reinforcement learning (RL) -- but also places greater emphasis on open-ended resolutions. We aim to address the question: ''Can the o1 model effectively generalize to broader domains where clear standards are absent and rewards are challenging to quantify?'' Marco-o1 is powered by Chain-of-Thought (CoT) fine-tuning, Monte Carlo Tree Search (MCTS), reflection mechanisms, and innovative reasoning strategies -- optimized for complex real-world problem-solving tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14405v2-abstract-full').style.display = 'none'; document.getElementById('2411.14405v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01114">arXiv:2411.01114</a> <span> [<a href="https://arxiv.org/pdf/2411.01114">pdf</a>, <a href="https://arxiv.org/format/2411.01114">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Infant Agent: A Tool-Integrated, Logic-Driven Agent with Cost-Effective API Usage </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+B">Bin Lei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuchen Li</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yiming Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+T">Tao Ren</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yi Luo</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zitian Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zeyu Hu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+W">Weitai Kang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qiuwu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01114v1-abstract-short" style="display: inline;"> Despite the impressive capabilities of large language models (LLMs), they currently exhibit two primary limitations, \textbf{\uppercase\expandafter{\romannumeral 1}}: They struggle to \textbf{autonomously solve the real world engineering problem}. \textbf{\uppercase\expandafter{\romannumeral 2}}: They remain \textbf{challenged in reasoning through complex logic problems}. To address these challeng… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01114v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01114v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01114v1-abstract-full" style="display: none;"> Despite the impressive capabilities of large language models (LLMs), they currently exhibit two primary limitations, \textbf{\uppercase\expandafter{\romannumeral 1}}: They struggle to \textbf{autonomously solve the real world engineering problem}. \textbf{\uppercase\expandafter{\romannumeral 2}}: They remain \textbf{challenged in reasoning through complex logic problems}. To address these challenges, we developed the \textsc{Infant Agent}, integrating task-aware functions, operators, a hierarchical management system, and a memory retrieval mechanism. Together, these components enable large language models to sustain extended reasoning processes and handle complex, multi-step tasks efficiently, all while significantly reducing API costs. Using the \textsc{Infant Agent}, GPT-4o's accuracy on the SWE-bench-lite dataset rises from $\mathbf{0.33\%}$ to $\mathbf{30\%}$, and in the AIME-2024 mathematics competition, it increases GPT-4o's accuracy from $\mathbf{13.3\%}$ to $\mathbf{37\%}$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01114v1-abstract-full').style.display = 'none'; document.getElementById('2411.01114v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21349">arXiv:2410.21349</a> <span> [<a href="https://arxiv.org/pdf/2410.21349">pdf</a>, <a href="https://arxiv.org/format/2410.21349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> FALCON: Feedback-driven Adaptive Long/short-term memory reinforced Coding Optimization system </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zeyuan Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lewei He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianhui Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+B">Bin Lei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuchen Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qiuwu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21349v2-abstract-short" style="display: inline;"> Recently, large language models (LLMs) have achieved significant progress in automated code generation. Despite their strong instruction-following capabilities, these models frequently struggled to align with user intent in coding scenarios. In particular, they were hampered by datasets that lacked diversity and failed to address specialized tasks or edge cases. Furthermore, challenges in supervis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21349v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21349v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21349v2-abstract-full" style="display: none;"> Recently, large language models (LLMs) have achieved significant progress in automated code generation. Despite their strong instruction-following capabilities, these models frequently struggled to align with user intent in coding scenarios. In particular, they were hampered by datasets that lacked diversity and failed to address specialized tasks or edge cases. Furthermore, challenges in supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF) led to failures in generating precise, human-intent-aligned code. To tackle these challenges and improve the code generation performance for automated programming systems, we propose Feedback-driven Adaptive Long/short-term memory reinforced Coding Optimization (i.e., FALCON). FALCON is structured into two hierarchical levels. From the global level, long-term memory improves code quality by retaining and applying learned knowledge. At the local level, short-term memory allows for the incorporation of immediate feedback from compilers and AI systems. Additionally, we introduce meta-reinforcement learning with feedback rewards to solve the global-local bi-level optimization problem and enhance the model's adaptability across diverse code generation tasks. Extensive experiments demonstrate that our technique achieves state-of-the-art performance, leading other reinforcement learning methods by more than 4.5 percentage points on the MBPP benchmark and 6.1 percentage points on the Humaneval benchmark. The open-sourced code is publicly available at https://github.com/titurte/FALCON. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21349v2-abstract-full').style.display = 'none'; document.getElementById('2410.21349v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16795">arXiv:2410.16795</a> <span> [<a href="https://arxiv.org/pdf/2410.16795">pdf</a>, <a href="https://arxiv.org/format/2410.16795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Traj-Explainer: An Explainable and Robust Multi-modal Trajectory Prediction Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+P">Pei Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haipeng Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiqun Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Meixin Zhu</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Z">Ziyuan Pu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16795v1-abstract-short" style="display: inline;"> Navigating complex traffic environments has been significantly enhanced by advancements in intelligent technologies, enabling accurate environment perception and trajectory prediction for automated vehicles. However, existing research often neglects the consideration of the joint reasoning of scenario agents and lacks interpretability in trajectory prediction models, thereby limiting their practic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16795v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16795v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16795v1-abstract-full" style="display: none;"> Navigating complex traffic environments has been significantly enhanced by advancements in intelligent technologies, enabling accurate environment perception and trajectory prediction for automated vehicles. However, existing research often neglects the consideration of the joint reasoning of scenario agents and lacks interpretability in trajectory prediction models, thereby limiting their practical application in real-world scenarios. To this purpose, an explainability-oriented trajectory prediction model is designed in this work, named Explainable Conditional Diffusion based Multimodal Trajectory Prediction Traj-Explainer, to retrieve the influencing factors of prediction and help understand the intrinsic mechanism of prediction. In Traj-Explainer, a modified conditional diffusion is well designed to capture the scenario multimodal trajectory pattern, and meanwhile, a modified Shapley Value model is assembled to rationally learn the importance of the global and scenario features. Numerical experiments are carried out by several trajectory prediction datasets, including Waymo, NGSIM, HighD, and MoCAD datasets. Furthermore, we evaluate the identified input factors which indicates that they are in agreement with the human driving experience, indicating the capability of the proposed model in appropriately learning the prediction. Code available in our open-source repository: \url{https://anonymous.4open.science/r/Interpretable-Prediction}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16795v1-abstract-full').style.display = 'none'; document.getElementById('2410.16795v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06558">arXiv:2410.06558</a> <span> [<a href="https://arxiv.org/pdf/2410.06558">pdf</a>, <a href="https://arxiv.org/format/2410.06558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep Correlated Prompting for Visual Recognition with Missing Modalities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lianyu Hu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tongkai Shi</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wei Feng</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+F">Fanhua Shang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+L">Liang Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06558v4-abstract-short" style="display: inline;"> Large-scale multimodal models have shown excellent performance over a series of tasks powered by the large corpus of paired multimodal training data. Generally, they are always assumed to receive modality-complete inputs. However, this simple assumption may not always hold in the real world due to privacy constraints or collection difficulty, where models pretrained on modality-complete data easil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06558v4-abstract-full').style.display = 'inline'; document.getElementById('2410.06558v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06558v4-abstract-full" style="display: none;"> Large-scale multimodal models have shown excellent performance over a series of tasks powered by the large corpus of paired multimodal training data. Generally, they are always assumed to receive modality-complete inputs. However, this simple assumption may not always hold in the real world due to privacy constraints or collection difficulty, where models pretrained on modality-complete data easily demonstrate degraded performance on missing-modality cases. To handle this issue, we refer to prompt learning to adapt large pretrained multimodal models to handle missing-modality scenarios by regarding different missing cases as different types of input. Instead of only prepending independent prompts to the intermediate layers, we present to leverage the correlations between prompts and input features and excavate the relationships between different layers of prompts to carefully design the instructions. We also incorporate the complementary semantics of different modalities to guide the prompting design for each modality. Extensive experiments on three commonly-used datasets consistently demonstrate the superiority of our method compared to the previous approaches upon different missing scenarios. Plentiful ablations are further given to show the generalizability and reliability of our method upon different modality-missing ratios and types. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06558v4-abstract-full').style.display = 'none'; document.getElementById('2410.06558v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024, add some results</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18014">arXiv:2409.18014</a> <span> [<a href="https://arxiv.org/pdf/2409.18014">pdf</a>, <a href="https://arxiv.org/format/2409.18014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Role-RL: Online Long-Context Processing with Role Reinforcement Learning for Distinct LLMs in Their Optimal Roles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+L">Lewei He</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Pengran Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bingzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qianglong Chen</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jiahui Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18014v1-abstract-short" style="display: inline;"> Large language models (LLMs) with long-context processing are still challenging because of their implementation complexity, training efficiency and data sparsity. To address this issue, a new paradigm named Online Long-context Processing (OLP) is proposed when we process a document of unlimited length, which typically occurs in the information reception and organization of diverse streaming media… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18014v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18014v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18014v1-abstract-full" style="display: none;"> Large language models (LLMs) with long-context processing are still challenging because of their implementation complexity, training efficiency and data sparsity. To address this issue, a new paradigm named Online Long-context Processing (OLP) is proposed when we process a document of unlimited length, which typically occurs in the information reception and organization of diverse streaming media such as automated news reporting, live e-commerce, and viral short videos. Moreover, a dilemma was often encountered when we tried to select the most suitable LLM from a large number of LLMs amidst explosive growth aiming for outstanding performance, affordable prices, and short response delays. In view of this, we also develop Role Reinforcement Learning (Role-RL) to automatically deploy different LLMs in their respective roles within the OLP pipeline according to their actual performance. Extensive experiments are conducted on our OLP-MINI dataset and it is found that OLP with Role-RL framework achieves OLP benchmark with an average recall rate of 93.2% and the LLM cost saved by 79.4%. The code and dataset are publicly available at: https://anonymous.4open.science/r/Role-RL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18014v1-abstract-full').style.display = 'none'; document.getElementById('2409.18014v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16709">arXiv:2409.16709</a> <span> [<a href="https://arxiv.org/pdf/2409.16709">pdf</a>, <a href="https://arxiv.org/format/2409.16709">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Pose-Guided Fine-Grained Sign Language Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tongkai Shi</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lianyu Hu</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+F">Fanhua Shang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jichao Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peidong Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wei Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16709v1-abstract-short" style="display: inline;"> Sign language videos are an important medium for spreading and learning sign language. However, most existing human image synthesis methods produce sign language images with details that are distorted, blurred, or structurally incorrect. They also produce sign language video frames with poor temporal consistency, with anomalies such as flickering and abrupt detail changes between the previous and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16709v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16709v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16709v1-abstract-full" style="display: none;"> Sign language videos are an important medium for spreading and learning sign language. However, most existing human image synthesis methods produce sign language images with details that are distorted, blurred, or structurally incorrect. They also produce sign language video frames with poor temporal consistency, with anomalies such as flickering and abrupt detail changes between the previous and next frames. To address these limitations, we propose a novel Pose-Guided Motion Model (PGMM) for generating fine-grained and motion-consistent sign language videos. Firstly, we propose a new Coarse Motion Module (CMM), which completes the deformation of features by optical flow warping, thus transfering the motion of coarse-grained structures without changing the appearance; Secondly, we propose a new Pose Fusion Module (PFM), which guides the modal fusion of RGB and pose features, thus completing the fine-grained generation. Finally, we design a new metric, Temporal Consistency Difference (TCD) to quantitatively assess the degree of temporal consistency of a video by comparing the difference between the frames of the reconstructed video and the previous and next frames of the target video. Extensive qualitative and quantitative experiments show that our method outperforms state-of-the-art methods in most benchmark tests, with visible improvements in details and temporal consistency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16709v1-abstract-full').style.display = 'none'; document.getElementById('2409.16709v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07464">arXiv:2409.07464</a> <span> [<a href="https://arxiv.org/pdf/2409.07464">pdf</a>, <a href="https://arxiv.org/format/2409.07464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reflective Human-Machine Co-adaptation for Enhanced Text-to-Image Generation Dialogue System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yuheng Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yinghui Xia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinsong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07464v1-abstract-short" style="display: inline;"> Today's image generation systems are capable of producing realistic and high-quality images. However, user prompts often contain ambiguities, making it difficult for these systems to interpret users' potential intentions. Consequently, machines need to interact with users multiple rounds to better understand users' intents. The unpredictable costs of using or learning image generation models throu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07464v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07464v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07464v1-abstract-full" style="display: none;"> Today's image generation systems are capable of producing realistic and high-quality images. However, user prompts often contain ambiguities, making it difficult for these systems to interpret users' potential intentions. Consequently, machines need to interact with users multiple rounds to better understand users' intents. The unpredictable costs of using or learning image generation models through multiple feedback interactions hinder their widespread adoption and full performance potential, especially for non-expert users. In this research, we aim to enhance the user-friendliness of our image generation system. To achieve this, we propose a reflective human-machine co-adaptation strategy, named RHM-CAS. Externally, the Agent engages in meaningful language interactions with users to reflect on and refine the generated images. Internally, the Agent tries to optimize the policy based on user preferences, ensuring that the final outcomes closely align with user preferences. Various experiments on different tasks demonstrate the effectiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07464v1-abstract-full').style.display = 'none'; document.getElementById('2409.07464v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00872">arXiv:2409.00872</a> <span> [<a href="https://arxiv.org/pdf/2409.00872">pdf</a>, <a href="https://arxiv.org/format/2409.00872">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Self-evolving Agents with reflective and memory-augmented abilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xuechen Liang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+M">Meiling Tao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yinghui Xia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">JingSong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00872v1-abstract-short" style="display: inline;"> Large language models (LLMs) have made significant advances in the field of natural language processing, but they still face challenges such as continuous decision-making. In this research, we propose a novel framework by integrating iterative feedback, reflective mechanisms, and a memory optimization mechanism based on the Ebbinghaus forgetting curve, it significantly enhances the agents' capabil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00872v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00872v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00872v1-abstract-full" style="display: none;"> Large language models (LLMs) have made significant advances in the field of natural language processing, but they still face challenges such as continuous decision-making. In this research, we propose a novel framework by integrating iterative feedback, reflective mechanisms, and a memory optimization mechanism based on the Ebbinghaus forgetting curve, it significantly enhances the agents' capabilities in handling multi-tasking and long-span information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00872v1-abstract-full').style.display = 'none'; document.getElementById('2409.00872v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00855">arXiv:2409.00855</a> <span> [<a href="https://arxiv.org/pdf/2409.00855">pdf</a>, <a href="https://arxiv.org/format/2409.00855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> LanguaShrink: Reducing Token Overhead with Psycholinguistics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xuechen Liang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+M">Meiling Tao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yinghui Xia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">JingSong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00855v1-abstract-short" style="display: inline;"> As large language models (LLMs) improve their capabilities in handling complex tasks, the issues of computational cost and efficiency due to long prompts are becoming increasingly prominent. To accelerate model inference and reduce costs, we propose an innovative prompt compression framework called LanguaShrink. Inspired by the observation that LLM performance depends on the density and position o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00855v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00855v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00855v1-abstract-full" style="display: none;"> As large language models (LLMs) improve their capabilities in handling complex tasks, the issues of computational cost and efficiency due to long prompts are becoming increasingly prominent. To accelerate model inference and reduce costs, we propose an innovative prompt compression framework called LanguaShrink. Inspired by the observation that LLM performance depends on the density and position of key information in the input prompts, LanguaShrink leverages psycholinguistic principles and the Ebbinghaus memory curve to achieve task-agnostic prompt compression. This effectively reduces prompt length while preserving essential information. We referred to the training method of OpenChat.The framework introduces part-of-speech priority compression and data distillation techniques, using smaller models to learn compression targets and employing a KL-regularized reinforcement learning strategy for training.\cite{wang2023openchat} Additionally, we adopt a chunk-based compression algorithm to achieve adjustable compression rates. We evaluate our method on multiple datasets, including LongBench, ZeroScrolls, Arxiv Articles, and a newly constructed novel test set. Experimental results show that LanguaShrink maintains semantic similarity while achieving up to 26 times compression. Compared to existing prompt compression methods, LanguaShrink improves end-to-end latency by 1.43 times. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00855v1-abstract-full').style.display = 'none'; document.getElementById('2409.00855v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15549">arXiv:2408.15549</a> <span> [<a href="https://arxiv.org/pdf/2408.15549">pdf</a>, <a href="https://arxiv.org/format/2408.15549">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> WildFeedback: Aligning LLMs With In-situ User Interactions And Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Taiwei Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhuoer Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Longqi Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Ying-Chun Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zexue He</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+M">Mengting Wan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Pei Zhou</a>, <a href="/search/cs?searchtype=author&query=Jauhar%2C+S">Sujay Jauhar</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaofeng Xu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xia Song</a>, <a href="/search/cs?searchtype=author&query=Neville%2C+J">Jennifer Neville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15549v1-abstract-short" style="display: inline;"> As large language models (LLMs) continue to advance, aligning these models with human preferences has emerged as a critical challenge. Traditional alignment methods, relying on human or LLM annotated datasets, are limited by their resource-intensive nature, inherent subjectivity, and the risk of feedback loops that amplify model biases. To overcome these limitations, we introduce WildFeedback, a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15549v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15549v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15549v1-abstract-full" style="display: none;"> As large language models (LLMs) continue to advance, aligning these models with human preferences has emerged as a critical challenge. Traditional alignment methods, relying on human or LLM annotated datasets, are limited by their resource-intensive nature, inherent subjectivity, and the risk of feedback loops that amplify model biases. To overcome these limitations, we introduce WildFeedback, a novel framework that leverages real-time, in-situ user interactions to create preference datasets that more accurately reflect authentic human values. WildFeedback operates through a three-step process: feedback signal identification, preference data construction, and user-guided evaluation. We applied this framework to a large corpus of user-LLM conversations, resulting in a rich preference dataset that reflects genuine user preferences. This dataset captures the nuances of user preferences by identifying and classifying feedback signals within natural conversations, thereby enabling the construction of more representative and context-sensitive alignment data. Our extensive experiments demonstrate that LLMs fine-tuned on WildFeedback exhibit significantly improved alignment with user preferences, as evidenced by both traditional benchmarks and our proposed user-guided evaluation. By incorporating real-time feedback from actual users, WildFeedback addresses the scalability, subjectivity, and bias challenges that plague existing approaches, marking a significant step toward developing LLMs that are more responsive to the diverse and evolving needs of their users. In summary, WildFeedback offers a robust, scalable solution for aligning LLMs with true human values, setting a new standard for the development and evaluation of user-centric language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15549v1-abstract-full').style.display = 'none'; document.getElementById('2408.15549v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08209">arXiv:2408.08209</a> <span> [<a href="https://arxiv.org/pdf/2408.08209">pdf</a>, <a href="https://arxiv.org/format/2408.08209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Modeling Domain and Feedback Transitions for Cross-Domain Sequential Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Changshuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Teng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08209v1-abstract-short" style="display: inline;"> Nowadays, many recommender systems encompass various domains to cater to users' diverse needs, leading to user behaviors transitioning across different domains. In fact, user behaviors across different domains reveal changes in preference toward recommended items. For instance, a shift from negative feedback to positive feedback indicates improved user satisfaction. However, existing cross-domain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08209v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08209v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08209v1-abstract-full" style="display: none;"> Nowadays, many recommender systems encompass various domains to cater to users' diverse needs, leading to user behaviors transitioning across different domains. In fact, user behaviors across different domains reveal changes in preference toward recommended items. For instance, a shift from negative feedback to positive feedback indicates improved user satisfaction. However, existing cross-domain sequential recommendation methods typically model user interests by focusing solely on information about domain transitions, often overlooking the valuable insights provided by users' feedback transitions. In this paper, we propose $\text{Transition}^2$, a novel method to model transitions across both domains and types of user feedback. Specifically, $\text{Transition}^2$ introduces a transition-aware graph encoder based on user history, assigning different weights to edges according to the feedback type. This enables the graph encoder to extract historical embeddings that capture the transition information between different domains and feedback types. Subsequently, we encode the user history using a cross-transition multi-head self-attention, incorporating various masks to distinguish different types of transitions. Finally, we integrate these modules to make predictions across different domains. Experimental results on two public datasets demonstrate the effectiveness of $\text{Transition}^2$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08209v1-abstract-full').style.display = 'none'; document.getElementById('2408.08209v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07791">arXiv:2408.07791</a> <span> [<a href="https://arxiv.org/pdf/2408.07791">pdf</a>, <a href="https://arxiv.org/format/2408.07791">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Efficient and Explanatory Image and Text Clustering System with Multimodal Autoencoder Architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tiancheng Shi</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yuanchen Wei</a>, <a href="/search/cs?searchtype=author&query=Kender%2C+J+R">John R. Kender</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07791v1-abstract-short" style="display: inline;"> We demonstrate the efficiencies and explanatory abilities of extensions to the common tools of Autoencoders and LLM interpreters, in the novel context of comparing different cultural approaches to the same international news event. We develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model that extends the modalities of previous CVAE models, by using fully-connected latent laye… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07791v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07791v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07791v1-abstract-full" style="display: none;"> We demonstrate the efficiencies and explanatory abilities of extensions to the common tools of Autoencoders and LLM interpreters, in the novel context of comparing different cultural approaches to the same international news event. We develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model that extends the modalities of previous CVAE models, by using fully-connected latent layers to embed in parallel the CNN encodings of video frames, together with the LSTM encodings of their related text derived from audio. We incorporate the model within a larger system that includes frame-caption alignment, latent space vector clustering, and a novel LLM-based cluster interpreter. We measure, tune, and apply this system to the task of summarizing a video into three to five thematic clusters, with each theme described by ten LLM-produced phrases. We apply this system to two news topics, COVID-19 and the Winter Olympics, and five other topics are in progress. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07791v1-abstract-full').style.display = 'none'; document.getElementById('2408.07791v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04998">arXiv:2408.04998</a> <span> [<a href="https://arxiv.org/pdf/2408.04998">pdf</a>, <a href="https://arxiv.org/format/2408.04998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ProFuser: Progressive Fusion of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+F">Fanqi Wan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Canbin Huang</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+X">Xiaojun Quan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenliang Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04998v1-abstract-short" style="display: inline;"> While fusing the capacities and advantages of various large language models (LLMs) offers a pathway to construct more powerful and versatile models, a fundamental challenge is to properly select advantageous model during the training. Existing fusion methods primarily focus on the training mode that uses cross entropy on ground truth in a teacher-forcing setup to measure a model's advantage, which… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04998v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04998v1-abstract-full" style="display: none;"> While fusing the capacities and advantages of various large language models (LLMs) offers a pathway to construct more powerful and versatile models, a fundamental challenge is to properly select advantageous model during the training. Existing fusion methods primarily focus on the training mode that uses cross entropy on ground truth in a teacher-forcing setup to measure a model's advantage, which may provide limited insight towards model advantage. In this paper, we introduce a novel approach that enhances the fusion process by incorporating both the training and inference modes. Our method evaluates model advantage not only through cross entropy during training but also by considering inference outputs, providing a more comprehensive assessment. To combine the two modes effectively, we introduce ProFuser to progressively transition from inference mode to training mode. To validate ProFuser's effectiveness, we fused three models, including vicuna-7b-v1.5, Llama-2-7b-chat, and mpt-7b-8k-chat, and demonstrated the improved performance in knowledge, reasoning, and safety compared to baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04998v1-abstract-full').style.display = 'none'; document.getElementById('2408.04998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02559">arXiv:2408.02559</a> <span> [<a href="https://arxiv.org/pdf/2408.02559">pdf</a>, <a href="https://arxiv.org/format/2408.02559">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating and Enhancing LLMs Agent based on Theory of Mind in Guandan: A Multi-Player Cooperative Game under Imperfect Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yim%2C+Y">Yauwai Yim</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+C">Chunkit Chan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zheye Deng</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wei Fan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianshi Zheng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02559v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown success in handling simple games with imperfect information and enabling multi-agent coordination, but their ability to facilitate practical collaboration against other agents in complex, imperfect information environments, especially in a non-English environment, still needs to be explored. This study investigates the applicability of knowledge acquired by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02559v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02559v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02559v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown success in handling simple games with imperfect information and enabling multi-agent coordination, but their ability to facilitate practical collaboration against other agents in complex, imperfect information environments, especially in a non-English environment, still needs to be explored. This study investigates the applicability of knowledge acquired by open-source and API-based LLMs to sophisticated text-based games requiring agent collaboration under imperfect information, comparing their performance to established baselines using other types of agents. We propose a Theory of Mind (ToM) planning technique that allows LLM agents to adapt their strategy against various adversaries using only game rules, current state, and historical context as input. An external tool was incorporated to mitigate the challenge of dynamic and extensive action spaces in this card game. Our results show that although a performance gap exists between current LLMs and state-of-the-art reinforcement learning (RL) models, LLMs demonstrate ToM capabilities in this game setting. It consistently improves their performance against opposing agents, suggesting their ability to understand the actions of allies and adversaries and establish collaboration with allies. To encourage further research and understanding, we have made our codebase openly accessible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02559v1-abstract-full').style.display = 'none'; document.getElementById('2408.02559v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19256">arXiv:2407.19256</a> <span> [<a href="https://arxiv.org/pdf/2407.19256">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Stochastic Parrots or ICU Experts? Large Language Models in Critical Care Medicine: A Scoping Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tongyue Shi</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jun Ma</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zihan Yu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haowei Xu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+M">Minqi Xiong</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+M">Meirong Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yilin Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huiying Zhao</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+G">Guilan Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19256v1-abstract-short" style="display: inline;"> With the rapid development of artificial intelligence (AI), large language models (LLMs) have shown strong capabilities in natural language understanding, reasoning, and generation, attracting amounts of research interest in applying LLMs to health and medicine. Critical care medicine (CCM) provides diagnosis and treatment for critically ill patients who often require intensive monitoring and inte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19256v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19256v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19256v1-abstract-full" style="display: none;"> With the rapid development of artificial intelligence (AI), large language models (LLMs) have shown strong capabilities in natural language understanding, reasoning, and generation, attracting amounts of research interest in applying LLMs to health and medicine. Critical care medicine (CCM) provides diagnosis and treatment for critically ill patients who often require intensive monitoring and interventions in intensive care units (ICUs). Can LLMs be applied to CCM? Are LLMs just like stochastic parrots or ICU experts in assisting clinical decision-making? This scoping review aims to provide a panoramic portrait of the application of LLMs in CCM. Literature in seven databases, including PubMed, Embase, Scopus, Web of Science, CINAHL, IEEE Xplore, and ACM Digital Library, were searched from January 1, 2019, to June 10, 2024. Peer-reviewed journal and conference articles that discussed the application of LLMs in critical care settings were included. From an initial 619 articles, 24 were selected for final review. This review grouped applications of LLMs in CCM into three categories: clinical decision support, medical documentation and reporting, and medical education and doctor-patient communication. LLMs have advantages in handling unstructured data and do not require manual feature engineering. Meanwhile, applying LLMs to CCM faces challenges, including hallucinations, poor interpretability, bias and alignment challenges, and privacy and ethics issues. Future research should enhance model reliability and interpretability, integrate up-to-date medical knowledge, and strengthen privacy and ethical guidelines. As LLMs evolve, they could become key tools in CCM to help improve patient outcomes and optimize healthcare delivery. This study is the first review of LLMs in CCM, aiding researchers, clinicians, and policymakers to understand the current status and future potentials of LLMs in CCM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19256v1-abstract-full').style.display = 'none'; document.getElementById('2407.19256v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16857">arXiv:2407.16857</a> <span> [<a href="https://arxiv.org/pdf/2407.16857">pdf</a>, <a href="https://arxiv.org/format/2407.16857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> SECRM-2D: RL-Based Efficient and Comfortable Route-Following Autonomous Driving with Analytic Safety Guarantees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Smirnov%2C+I">Ilia Smirnov</a>, <a href="/search/cs?searchtype=author&query=ElSamadisy%2C+O">Omar ElSamadisy</a>, <a href="/search/cs?searchtype=author&query=Abdulhai%2C+B">Baher Abdulhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16857v1-abstract-short" style="display: inline;"> Over the last decade, there has been increasing interest in autonomous driving systems. Reinforcement Learning (RL) shows great promise for training autonomous driving controllers, being able to directly optimize a combination of criteria such as efficiency comfort, and stability. However, RL- based controllers typically offer no safety guarantees, making their readiness for real deployment questi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16857v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16857v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16857v1-abstract-full" style="display: none;"> Over the last decade, there has been increasing interest in autonomous driving systems. Reinforcement Learning (RL) shows great promise for training autonomous driving controllers, being able to directly optimize a combination of criteria such as efficiency comfort, and stability. However, RL- based controllers typically offer no safety guarantees, making their readiness for real deployment questionable. In this paper, we propose SECRM-2D (the Safe, Efficient and Comfortable RL- based driving Model with Lane-Changing), an RL autonomous driving controller (both longitudinal and lateral) that balances optimization of efficiency and comfort and follows a fixed route, while being subject to hard analytic safety constraints. The aforementioned safety constraints are derived from the criterion that the follower vehicle must have sufficient headway to be able to avoid a crash if the leader vehicle brakes suddenly. We evaluate SECRM-2D against several learning and non-learning baselines in simulated test scenarios, including freeway driving, exiting, merging, and emergency braking. Our results confirm that representative previously-published RL AV controllers may crash in both training and testing, even if they are optimizing a safety objective. By contrast, our controller SECRM-2D is successful in avoiding crashes during both training and testing, improves over the baselines in measures of efficiency and comfort, and is more faithful in following the prescribed route. In addition, we achieve a good theoretical understanding of the longitudinal steady-state of a collection of SECRM-2D vehicles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16857v1-abstract-full').style.display = 'none'; document.getElementById('2407.16857v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06083">arXiv:2407.06083</a> <span> [<a href="https://arxiv.org/pdf/2407.06083">pdf</a>, <a href="https://arxiv.org/format/2407.06083">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Controllable Learning: Methods and Applications in Information Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chenglei Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Teng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Changshuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+G">Guofu Xie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06083v1-abstract-short" style="display: inline;"> Controllable learning (CL) emerges as a critical component in trustworthy machine learning, ensuring that learners meet predefined targets and can adaptively adjust without retraining according to the changes in those targets. We provide a formal definition of CL, and discuss its applications in information retrieval (IR) where information needs are often complex and dynamic. The survey categorize… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06083v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06083v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06083v1-abstract-full" style="display: none;"> Controllable learning (CL) emerges as a critical component in trustworthy machine learning, ensuring that learners meet predefined targets and can adaptively adjust without retraining according to the changes in those targets. We provide a formal definition of CL, and discuss its applications in information retrieval (IR) where information needs are often complex and dynamic. The survey categorizes CL according to who controls (users or platforms), what is controllable (e.g., retrieval objectives, users' historical behaviors, controllable environmental adaptation), how control is implemented (e.g., rule-based method, Pareto optimization, Hypernetwork), and where to implement control (e.g.,pre-processing, in-processing, post-processing methods). Then, we identify challenges faced by CL across training, evaluation, task setting, and deployment in online environments. Additionally, we outline promising directions for CL in theoretical analysis, efficient computation, empowering large language models, application scenarios and evaluation frameworks in IR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06083v1-abstract-full').style.display = 'none'; document.getElementById('2407.06083v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03332">arXiv:2407.03332</a> <span> [<a href="https://arxiv.org/pdf/2407.03332">pdf</a>, <a href="https://arxiv.org/format/2407.03332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DDPM-MoCo: Advancing Industrial Surface Defect Generation and Detection with Generative and Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyan Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03332v1-abstract-short" style="display: inline;"> The task of industrial detection based on deep learning often involves solving two problems: (1) obtaining sufficient and effective data samples, (2) and using efficient and convenient model training methods. In this paper, we introduce a novel defect-generation method, named DDPM-MoCo, to address these issues. Firstly, we utilize the Denoising Diffusion Probabilistic Model (DDPM) to generate high… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03332v1-abstract-full').style.display = 'inline'; document.getElementById('2407.03332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03332v1-abstract-full" style="display: none;"> The task of industrial detection based on deep learning often involves solving two problems: (1) obtaining sufficient and effective data samples, (2) and using efficient and convenient model training methods. In this paper, we introduce a novel defect-generation method, named DDPM-MoCo, to address these issues. Firstly, we utilize the Denoising Diffusion Probabilistic Model (DDPM) to generate high-quality defect data samples, overcoming the problem of insufficient sample data for model learning. Furthermore, we utilize the unsupervised learning Momentum Contrast model (MoCo) with an enhanced batch contrastive loss function for training the model on unlabeled data, addressing the efficiency and consistency challenges in large-scale negative sample encoding during diffusion model training. The experimental results showcase an enhanced visual detection method for identifying defects on metal surfaces, covering the entire process, starting from generating unlabeled sample data for training the diffusion model, to utilizing the same labeled sample data for downstream detection tasks. This study offers valuable practical insights and application potential for visual detection in the metal processing industry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03332v1-abstract-full').style.display = 'none'; document.getElementById('2407.03332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01219">arXiv:2407.01219</a> <span> [<a href="https://arxiv.org/pdf/2407.01219">pdf</a>, <a href="https://arxiv.org/format/2407.01219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Searching for Best Practices in Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaohua Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenghua Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xuan Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Feiran Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yixin Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhibo Xu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shizheng Li</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Q">Qi Qian</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+R">Ruicheng Yin</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+C">Changze Lv</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiaoqing Zheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01219v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) techniques have proven to be effective in integrating up-to-date information, mitigating hallucinations, and enhancing response quality, particularly in specialized domains. While many RAG approaches have been proposed to enhance large language models through query-dependent retrievals, these approaches still suffer from their complex implementation and prolong… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01219v1-abstract-full').style.display = 'inline'; document.getElementById('2407.01219v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01219v1-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) techniques have proven to be effective in integrating up-to-date information, mitigating hallucinations, and enhancing response quality, particularly in specialized domains. While many RAG approaches have been proposed to enhance large language models through query-dependent retrievals, these approaches still suffer from their complex implementation and prolonged response times. Typically, a RAG workflow involves multiple processing steps, each of which can be executed in various ways. Here, we investigate existing RAG approaches and their potential combinations to identify optimal RAG practices. Through extensive experiments, we suggest several strategies for deploying RAG that balance both performance and efficiency. Moreover, we demonstrate that multimodal retrieval techniques can significantly enhance question-answering capabilities about visual inputs and accelerate the generation of multimodal content using a "retrieval as generation" strategy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01219v1-abstract-full').style.display = 'none'; document.getElementById('2407.01219v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17807">arXiv:2406.17807</a> <span> [<a href="https://arxiv.org/pdf/2406.17807">pdf</a>, <a href="https://arxiv.org/format/2406.17807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Commentary Strategies for Imperfect Information Card Games: A Study of Large Language Models in Guandan Commentary </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+M">Meiling Tao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xuechen Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Y">Yiling Tao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17807v3-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have unlocked the potential for generating high-quality game commentary. However, producing insightful and engaging commentary for complex games with incomplete information remains a significant challenge. In this paper, we introduce a novel commentary method that combine Reinforcement Learning (RL) and LLMs, tailored specifically for the Chinese… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17807v3-abstract-full').style.display = 'inline'; document.getElementById('2406.17807v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17807v3-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have unlocked the potential for generating high-quality game commentary. However, producing insightful and engaging commentary for complex games with incomplete information remains a significant challenge. In this paper, we introduce a novel commentary method that combine Reinforcement Learning (RL) and LLMs, tailored specifically for the Chinese card game \textit{Guandan}. Our system leverages RL to generate intricate card-playing scenarios and employs LLMs to generate corresponding commentary text, effectively emulating the strategic analysis and narrative prowess of professional commentators. The framework comprises a state commentary guide, a Theory of Mind (ToM)-based strategy analyzer, and a style retrieval module, which seamlessly collaborate to deliver detailed and context-relevant game commentary in the Chinese language environment. We empower LLMs with ToM capabilities and refine both retrieval and information filtering mechanisms. This facilitates the generation of personalized commentary content. Our experimental results showcase the substantial enhancement in performance achieved by the proposed commentary framework when applied to open-source LLMs, surpassing the performance of GPT-4 across multiple evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17807v3-abstract-full').style.display = 'none'; document.getElementById('2406.17807v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16942">arXiv:2406.16942</a> <span> [<a href="https://arxiv.org/pdf/2406.16942">pdf</a>, <a href="https://arxiv.org/format/2406.16942">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Diagnostic Reliability of Foundation Model with Uncertainty Estimation in OCT Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yuanyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+A">Aidi Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tian Lin</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+K">Ke Zou</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yinglin Cheng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tingkun Shi</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+X">Xulong Liao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+L">Lixia Feng</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhen Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinjian Chen</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Huazhu Fu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haoyu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16942v1-abstract-short" style="display: inline;"> Inability to express the confidence level and detect unseen classes has limited the clinical implementation of artificial intelligence in the real-world. We developed a foundation model with uncertainty estimation (FMUE) to detect 11 retinal conditions on optical coherence tomography (OCT). In the internal test set, FMUE achieved a higher F1 score of 96.76% than two state-of-the-art algorithms, RE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16942v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16942v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16942v1-abstract-full" style="display: none;"> Inability to express the confidence level and detect unseen classes has limited the clinical implementation of artificial intelligence in the real-world. We developed a foundation model with uncertainty estimation (FMUE) to detect 11 retinal conditions on optical coherence tomography (OCT). In the internal test set, FMUE achieved a higher F1 score of 96.76% than two state-of-the-art algorithms, RETFound and UIOS, and got further improvement with thresholding strategy to 98.44%. In the external test sets obtained from other OCT devices, FMUE achieved an accuracy of 88.75% and 92.73% before and after thresholding. Our model is superior to two ophthalmologists with a higher F1 score (95.17% vs. 61.93% &71.72%). Besides, our model correctly predicts high uncertainty scores for samples with ambiguous features, of non-target-category diseases, or with low-quality to prompt manual checks and prevent misdiagnosis. FMUE provides a trustworthy method for automatic retinal anomalies detection in the real-world clinical open set environment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16942v1-abstract-full').style.display = 'none'; document.getElementById('2406.16942v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">All codes are available at https://github.com/yuanyuanpeng0129/FMUE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16062">arXiv:2406.16062</a> <span> [<a href="https://arxiv.org/pdf/2406.16062">pdf</a>, <a href="https://arxiv.org/format/2406.16062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Towards Biologically Plausible Computing: A Comprehensive Comparison </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+C">Changze Lv</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yufei Gu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhengkang Guo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhibo Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yixin Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Feiran Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenghua Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+R">Ruicheng Yin</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+Y">Yu Shang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+S">Siqi Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaohua Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Muling Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenhao Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianlong Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianhao Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Cenyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zixuan Ling</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiaoqing Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16062v1-abstract-short" style="display: inline;"> Backpropagation is a cornerstone algorithm in training neural networks for supervised learning, which uses a gradient descent method to update network weights by minimizing the discrepancy between actual and desired outputs. Despite its pivotal role in propelling deep learning advancements, the biological plausibility of backpropagation is questioned due to its requirements for weight symmetry, gl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16062v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16062v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16062v1-abstract-full" style="display: none;"> Backpropagation is a cornerstone algorithm in training neural networks for supervised learning, which uses a gradient descent method to update network weights by minimizing the discrepancy between actual and desired outputs. Despite its pivotal role in propelling deep learning advancements, the biological plausibility of backpropagation is questioned due to its requirements for weight symmetry, global error computation, and dual-phase training. To address this long-standing challenge, many studies have endeavored to devise biologically plausible training algorithms. However, a fully biologically plausible algorithm for training multilayer neural networks remains elusive, and interpretations of biological plausibility vary among researchers. In this study, we establish criteria for biological plausibility that a desirable learning algorithm should meet. Using these criteria, we evaluate a range of existing algorithms considered to be biologically plausible, including Hebbian learning, spike-timing-dependent plasticity, feedback alignment, target propagation, predictive coding, forward-forward algorithm, perturbation learning, local losses, and energy-based learning. Additionally, we empirically evaluate these algorithms across diverse network architectures and datasets. We compare the feature representations learned by these algorithms with brain activity recorded by non-invasive devices under identical stimuli, aiming to identify which algorithm can most accurately replicate brain activity patterns. We are hopeful that this study could inspire the development of new biologically plausible algorithms for training multilayer networks, thereby fostering progress in both the fields of neuroscience and machine learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16062v1-abstract-full').style.display = 'none'; document.getElementById('2406.16062v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09317">arXiv:2406.09317</a> <span> [<a href="https://arxiv.org/pdf/2406.09317">pdf</a>, <a href="https://arxiv.org/format/2406.09317">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Common and Rare Fundus Diseases Identification Using Vision-Language Foundation Model with Knowledge of Over 400 Diseases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tian Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+A">Aidi Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Kai Yu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yuanyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lianyu Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Cheng Chen</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+K">Ke Zou</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+H">Huiyu Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Man Chen</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+X">Xue Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meiqin Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Binwei Huang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chaoxin Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peixin Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yilong Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yifan Chen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Honghe Xia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tingkun Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jinming Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaolin Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingcheng Wang</a>, <a href="/search/cs?searchtype=author&query=Tham%2C+Y+C">Yih Chung Tham</a> , et al. (24 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09317v2-abstract-short" style="display: inline;"> Previous foundation models for retinal images were pre-trained with limited disease categories and knowledge base. Here we introduce RetiZero, a vision-language foundation model that leverages knowledge from over 400 fundus diseases. To RetiZero's pre-training, we compiled 341,896 fundus images paired with text descriptions, sourced from public datasets, ophthalmic literature, and online resources… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09317v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09317v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09317v2-abstract-full" style="display: none;"> Previous foundation models for retinal images were pre-trained with limited disease categories and knowledge base. Here we introduce RetiZero, a vision-language foundation model that leverages knowledge from over 400 fundus diseases. To RetiZero's pre-training, we compiled 341,896 fundus images paired with text descriptions, sourced from public datasets, ophthalmic literature, and online resources, encompassing a diverse range of diseases across multiple ethnicities and countries. RetiZero exhibits superior performance in several downstream tasks, including zero-shot disease recognition, image-to-image retrieval, and internal- and cross-domain disease identification. In zero-shot scenarios, RetiZero achieves Top5 accuracy scores of 0.8430 for 15 fundus diseases and 0.7561 for 52 fundus diseases. For image retrieval, it achieves Top5 scores of 0.9500 and 0.8860 for the same disease sets, respectively. Clinical evaluations show that RetiZero's Top3 zero-shot performance surpasses the average of 19 ophthalmologists from Singapore, China and the United States. Furthermore, RetiZero significantly enhances clinicians' accuracy in diagnosing fundus disease. These findings underscore the value of integrating the RetiZero foundation model into clinical settings, where a variety of fundus diseases are encountered. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09317v2-abstract-full').style.display = 'none'; document.getElementById('2406.09317v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07590">arXiv:2406.07590</a> <span> [<a href="https://arxiv.org/pdf/2406.07590">pdf</a>, <a href="https://arxiv.org/format/2406.07590">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> StreamPrompt: Learnable Prompt-guided Data Selection for Efficient Stream Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tongjun Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuhao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07590v1-abstract-short" style="display: inline;"> Stream Learning (SL) requires models to rapidly adapt to continuous data streams, setting it apart from traditional Continual Learning (CL). Recent SL methods emphasize efficiency by selecting data subsets for training, but they often struggle due to their reliance on static, rule-based selection algorithms that cannot effectively adapt to the changing importance of data. In this work, we introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07590v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07590v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07590v1-abstract-full" style="display: none;"> Stream Learning (SL) requires models to rapidly adapt to continuous data streams, setting it apart from traditional Continual Learning (CL). Recent SL methods emphasize efficiency by selecting data subsets for training, but they often struggle due to their reliance on static, rule-based selection algorithms that cannot effectively adapt to the changing importance of data. In this work, we introduce StreamPrompt, a method that enhances data selection through dynamic, learnable prompts. These dynamic prompts serve two purposes beyond guiding model inference: 1) optimizing data selection, and 2) guiding updates to the rehearsal buffer. This approach addresses the challenges of adaptability and computational efficiency in processing continuous data streams. Moreover, StreamPrompt introduces Prompt Attunement,a mechanism that enhances the efficiency of prompt learning. By leveraging attention layers from vision transformers and softly combining their outputs with a gate unit, Prompt Attunementrefines prompts with minimal computational resources. Comprehensive evaluations demonstrate StreamPrompts superior performance over state-of-the-art, with significant improvements in accuracy and reductions in training time. These results underscore the efficacy and efficiency of StreamPrompt, establishing its potential as a scalable and effective solution for the evolving demands of SL. Our code is available at https://github.com/intellistream/Efficient-Stream-Learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07590v1-abstract-full').style.display = 'none'; document.getElementById('2406.07590v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05628">arXiv:2406.05628</a> <span> [<a href="https://arxiv.org/pdf/2406.05628">pdf</a>, <a href="https://arxiv.org/format/2406.05628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Domain Generalization Guided by Large-Scale Pre-Trained Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zongbin Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+B">Bin Pan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Shiyu Shen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyang Shi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenwei Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05628v1-abstract-short" style="display: inline;"> Domain generalization (DG) aims to train a model from limited source domains, allowing it to generalize to unknown target domains. Typically, DG models only employ large-scale pre-trained models during the initialization of fine-tuning. However, large-scale pre-trained models already possess the ability to resist domain shift. If we reference pre-trained models continuously during fine-tuning to m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05628v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05628v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05628v1-abstract-full" style="display: none;"> Domain generalization (DG) aims to train a model from limited source domains, allowing it to generalize to unknown target domains. Typically, DG models only employ large-scale pre-trained models during the initialization of fine-tuning. However, large-scale pre-trained models already possess the ability to resist domain shift. If we reference pre-trained models continuously during fine-tuning to maintain this ability, it could further enhance the generalization ability of the DG model. For this purpose, we introduce a new method called Fine-Tune with Large-scale pre-trained Priors (FT-LP), which incorporates the pre-trained model as a prior into the DG fine-tuning process, ensuring that the model refers to its pre-trained model at each optimization step. FT-LP comprises a theoretical framework and a simple implementation strategy. In theory, we verify the rationality of FT-LP by introducing a generalization error bound with the pre-trained priors for DG. In implementation, we utilize an encoder to simulate the model distribution, enabling the use of FT-LP when only pre-trained weights are available. In summary, we offer a new fine-tuning method for DG algorithms to utilize pre-trained models throughout the fine-tuning process. Through experiments on various datasets and DG models, our proposed method exhibits significant improvements, indicating its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05628v1-abstract-full').style.display = 'none'; document.getElementById('2406.05628v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04828">arXiv:2406.04828</a> <span> [<a href="https://arxiv.org/pdf/2406.04828">pdf</a>, <a href="https://arxiv.org/format/2406.04828">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> QAGCF: Graph Collaborative Filtering for Q&A Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Changshuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Teng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yanping Zheng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04828v1-abstract-short" style="display: inline;"> Question and answer (Q&A) platforms usually recommend question-answer pairs to meet users' knowledge acquisition needs, unlike traditional recommendations that recommend only one item. This makes user behaviors more complex, and presents two challenges for Q&A recommendation, including: the collaborative information entanglement, which means user feedback is influenced by either the question or th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04828v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04828v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04828v1-abstract-full" style="display: none;"> Question and answer (Q&A) platforms usually recommend question-answer pairs to meet users' knowledge acquisition needs, unlike traditional recommendations that recommend only one item. This makes user behaviors more complex, and presents two challenges for Q&A recommendation, including: the collaborative information entanglement, which means user feedback is influenced by either the question or the answer; and the semantic information entanglement, where questions are correlated with their corresponding answers, and correlations also exist among different question-answer pairs. Traditional recommendation methods treat the question-answer pair as a whole or only consider the answer as a single item, which overlooks the two challenges and cannot effectively model user interests. To address these challenges, we introduce Question & Answer Graph Collaborative Filtering (QAGCF), a graph neural network model that creates separate graphs for collaborative and semantic views to disentangle the information in question-answer pairs. The collaborative view disentangles questions and answers to individually model collaborative information, while the semantic view captures the semantic information both within and between question-answer pairs. These views are further merged into a global graph to integrate the collaborative and semantic information. Polynomial-based graph filters are used to address the high heterophily issues of the global graph. Additionally, contrastive learning is utilized to obtain robust embeddings during training. Extensive experiments on industrial and public datasets demonstrate that QAGCF consistently outperforms baselines and achieves state-of-the-art results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04828v1-abstract-full').style.display = 'none'; document.getElementById('2406.04828v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04712">arXiv:2406.04712</a> <span> [<a href="https://arxiv.org/pdf/2406.04712">pdf</a>, <a href="https://arxiv.org/format/2406.04712">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AICoderEval: Improving AI Domain Code Generation of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yinghui Xia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuyan Chen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinsong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04712v1-abstract-short" style="display: inline;"> Automated code generation is a pivotal capability of large language models (LLMs). However, assessing this capability in real-world scenarios remains challenging. Previous methods focus more on low-level code generation, such as model loading, instead of generating high-level codes catering for real-world tasks, such as image-to-text, text classification, in various domains. Therefore, we construc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04712v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04712v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04712v1-abstract-full" style="display: none;"> Automated code generation is a pivotal capability of large language models (LLMs). However, assessing this capability in real-world scenarios remains challenging. Previous methods focus more on low-level code generation, such as model loading, instead of generating high-level codes catering for real-world tasks, such as image-to-text, text classification, in various domains. Therefore, we construct AICoderEval, a dataset focused on real-world tasks in various domains based on HuggingFace, PyTorch, and TensorFlow, along with comprehensive metrics for evaluation and enhancing LLMs' task-specific code generation capability. AICoderEval contains test cases and complete programs for automated evaluation of these tasks, covering domains such as natural language processing, computer vision, and multimodal learning. To facilitate research in this area, we open-source the AICoderEval dataset at \url{https://huggingface.co/datasets/vixuowis/AICoderEval}. After that, we propose CoderGen, an agent-based framework, to help LLMs generate codes related to real-world tasks on the constructed AICoderEval. Moreover, we train a more powerful task-specific code generation model, named AICoder, which is refined on llama-3 based on AICoderEval. Our experiments demonstrate the effectiveness of CoderGen in improving LLMs' task-specific code generation capability (by 12.00\% on pass@1 for original model and 9.50\% on pass@1 for ReAct Agent). AICoder also outperforms current code generation LLMs, indicating the great quality of the AICoderEval benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04712v1-abstract-full').style.display = 'none'; document.getElementById('2406.04712v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16847">arXiv:2405.16847</a> <span> [<a href="https://arxiv.org/pdf/2405.16847">pdf</a>, <a href="https://arxiv.org/format/2405.16847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TokenUnify: Scalable Autoregressive Visual Pre-training with Mixture Token Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yinda Chen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Haoyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyu Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Te Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruobing Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dong Liu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zhiwei Xiong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16847v1-abstract-short" style="display: inline;"> Autoregressive next-token prediction is a standard pretraining method for large-scale language models, but its application to vision tasks is hindered by the non-sequential nature of image data, leading to cumulative errors. Most vision models employ masked autoencoder (MAE) based pretraining, which faces scalability issues. To address these challenges, we introduce \textbf{TokenUnify}, a novel pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16847v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16847v1-abstract-full" style="display: none;"> Autoregressive next-token prediction is a standard pretraining method for large-scale language models, but its application to vision tasks is hindered by the non-sequential nature of image data, leading to cumulative errors. Most vision models employ masked autoencoder (MAE) based pretraining, which faces scalability issues. To address these challenges, we introduce \textbf{TokenUnify}, a novel pretraining method that integrates random token prediction, next-token prediction, and next-all token prediction. We provide theoretical evidence demonstrating that TokenUnify mitigates cumulative errors in visual autoregression. Cooperated with TokenUnify, we have assembled a large-scale electron microscopy (EM) image dataset with ultra-high resolution, ideal for creating spatially correlated long sequences. This dataset includes over 120 million annotated voxels, making it the largest neuron segmentation dataset to date and providing a unified benchmark for experimental validation. Leveraging the Mamba network inherently suited for long-sequence modeling on this dataset, TokenUnify not only reduces the computational complexity but also leads to a significant 45\% improvement in segmentation performance on downstream EM neuron segmentation tasks compared to existing methods. Furthermore, TokenUnify demonstrates superior scalability over MAE and traditional autoregressive methods, effectively bridging the gap between pretraining strategies for language and vision models. Code is available at \url{https://github.com/ydchen0806/TokenUnify}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16847v1-abstract-full').style.display = 'none'; document.getElementById('2405.16847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16701">arXiv:2405.16701</a> <span> [<a href="https://arxiv.org/pdf/2405.16701">pdf</a>, <a href="https://arxiv.org/format/2405.16701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Detail-Enhanced Intra- and Inter-modal Interaction for Audio-Visual Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tong Shi</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+X">Xuri Ge</a>, <a href="/search/cs?searchtype=author&query=Jose%2C+J+M">Joemon M. Jose</a>, <a href="/search/cs?searchtype=author&query=Pugeault%2C+N">Nicolas Pugeault</a>, <a href="/search/cs?searchtype=author&query=Henderson%2C+P">Paul Henderson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16701v1-abstract-short" style="display: inline;"> Capturing complex temporal relationships between video and audio modalities is vital for Audio-Visual Emotion Recognition (AVER). However, existing methods lack attention to local details, such as facial state changes between video frames, which can reduce the discriminability of features and thus lower recognition accuracy. In this paper, we propose a Detail-Enhanced Intra- and Inter-modal Intera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16701v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16701v1-abstract-full" style="display: none;"> Capturing complex temporal relationships between video and audio modalities is vital for Audio-Visual Emotion Recognition (AVER). However, existing methods lack attention to local details, such as facial state changes between video frames, which can reduce the discriminability of features and thus lower recognition accuracy. In this paper, we propose a Detail-Enhanced Intra- and Inter-modal Interaction network(DE-III) for AVER, incorporating several novel aspects. We introduce optical flow information to enrich video representations with texture details that better capture facial state changes. A fusion module integrates the optical flow estimation with the corresponding video frames to enhance the representation of facial texture variations. We also design attentive intra- and inter-modal feature enhancement modules to further improve the richness and discriminability of video and audio representations. A detailed quantitative evaluation shows that our proposed model outperforms all existing methods on three benchmark datasets for both concrete and continuous emotion recognition. To encourage further research and ensure replicability, we will release our full code upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16701v1-abstract-full').style.display = 'none'; document.getElementById('2405.16701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to 27th International Conference of Pattern Recognition (ICPR 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04135">arXiv:2405.04135</a> <span> [<a href="https://arxiv.org/pdf/2405.04135">pdf</a>, <a href="https://arxiv.org/format/2405.04135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> In-context Learning for Automated Driving Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Ziqi Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyue Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Khamis%2C+A">Alaa Khamis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04135v1-abstract-short" style="display: inline;"> One of the key challenges in current Reinforcement Learning (RL)-based Automated Driving (AD) agents is achieving flexible, precise, and human-like behavior cost-effectively. This paper introduces an innovative approach utilizing Large Language Models (LLMs) to intuitively and effectively optimize RL reward functions in a human-centric way. We developed a framework where instructions and dynamic e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04135v1-abstract-full').style.display = 'inline'; document.getElementById('2405.04135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04135v1-abstract-full" style="display: none;"> One of the key challenges in current Reinforcement Learning (RL)-based Automated Driving (AD) agents is achieving flexible, precise, and human-like behavior cost-effectively. This paper introduces an innovative approach utilizing Large Language Models (LLMs) to intuitively and effectively optimize RL reward functions in a human-centric way. We developed a framework where instructions and dynamic environment descriptions are input into the LLM. The LLM then utilizes this information to assist in generating rewards, thereby steering the behavior of RL agents towards patterns that more closely resemble human driving. The experimental results demonstrate that this approach not only makes RL agents more anthropomorphic but also reaches better performance. Additionally, various strategies for reward-proxy and reward-shaping are investigated, revealing the significant impact of prompt design on shaping an AD vehicle's behavior. These findings offer a promising direction for the development of more advanced and human-like automated driving systems. Our experimental data and source code can be found here. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04135v1-abstract-full').style.display = 'none'; document.getElementById('2405.04135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 6 figures, 35 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02289">arXiv:2405.02289</a> <span> [<a href="https://arxiv.org/pdf/2405.02289">pdf</a>, <a href="https://arxiv.org/format/2405.02289">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> TSDiT: Traffic Scene Diffusion Models With Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chen Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02289v1-abstract-short" style="display: inline;"> In this paper, we introduce a novel approach to trajectory generation for autonomous driving, combining the strengths of Diffusion models and Transformers. First, we use the historical trajectory data for efficient preprocessing and generate action latent using a diffusion model with DiT(Diffusion with Transformers) Blocks to increase scene diversity and stochasticity of agent actions. Then, we co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02289v1-abstract-full').style.display = 'inline'; document.getElementById('2405.02289v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02289v1-abstract-full" style="display: none;"> In this paper, we introduce a novel approach to trajectory generation for autonomous driving, combining the strengths of Diffusion models and Transformers. First, we use the historical trajectory data for efficient preprocessing and generate action latent using a diffusion model with DiT(Diffusion with Transformers) Blocks to increase scene diversity and stochasticity of agent actions. Then, we combine action latent, historical trajectories and HD Map features and put them into different transformer blocks. Finally, we use a trajectory decoder to generate future trajectories of agents in the traffic scene. The method exhibits superior performance in generating smooth turning trajectories, enhancing the model's capability to fit complex steering patterns. The experimental results demonstrate the effectiveness of our method in producing realistic and diverse trajectories, showcasing its potential for application in autonomous vehicle navigation systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02289v1-abstract-full').style.display = 'none'; document.getElementById('2405.02289v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01063">arXiv:2405.01063</a> <span> [<a href="https://arxiv.org/pdf/2405.01063">pdf</a>, <a href="https://arxiv.org/format/2405.01063">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Fair Recommendations with Limited Sensitive Attributes: A Distributionally Robust Optimization Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianhao Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jizhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Fuli Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01063v2-abstract-short" style="display: inline;"> As recommender systems are indispensable in various domains such as job searching and e-commerce, providing equitable recommendations to users with different sensitive attributes becomes an imperative requirement. Prior approaches for enhancing fairness in recommender systems presume the availability of all sensitive attributes, which can be difficult to obtain due to privacy concerns or inadequat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01063v2-abstract-full').style.display = 'inline'; document.getElementById('2405.01063v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01063v2-abstract-full" style="display: none;"> As recommender systems are indispensable in various domains such as job searching and e-commerce, providing equitable recommendations to users with different sensitive attributes becomes an imperative requirement. Prior approaches for enhancing fairness in recommender systems presume the availability of all sensitive attributes, which can be difficult to obtain due to privacy concerns or inadequate means of capturing these attributes. In practice, the efficacy of these approaches is limited, pushing us to investigate ways of promoting fairness with limited sensitive attribute information. Toward this goal, it is important to reconstruct missing sensitive attributes. Nevertheless, reconstruction errors are inevitable due to the complexity of real-world sensitive attribute reconstruction problems and legal regulations. Thus, we pursue fair learning methods that are robust to reconstruction errors. To this end, we propose Distributionally Robust Fair Optimization (DRFO), which minimizes the worst-case unfairness over all potential probability distributions of missing sensitive attributes instead of the reconstructed one to account for the impact of the reconstruction errors. We provide theoretical and empirical evidence to demonstrate that our method can effectively ensure fairness in recommender systems when only limited sensitive attributes are accessible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01063v2-abstract-full').style.display = 'none'; document.getElementById('2405.01063v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures, accepted by SIGIR'24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.12529">arXiv:2404.12529</a> <span> [<a href="https://arxiv.org/pdf/2404.12529">pdf</a>, <a href="https://arxiv.org/format/2404.12529">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Bluetooth Indoor Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Taolei Shi</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+W">Wei Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.12529v1-abstract-short" style="display: inline;"> Nowadays, indoor localization has received extensive research interest due to more and more applications' needs for location information to provide a more precise and effective service [1], [2]. There are various wireless techniques and mechanisms that have been proposed; some of them have been studied in depth and come into use, such as Wi-Fi, RFID, and sensor networks. In comparison, the develop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12529v1-abstract-full').style.display = 'inline'; document.getElementById('2404.12529v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.12529v1-abstract-full" style="display: none;"> Nowadays, indoor localization has received extensive research interest due to more and more applications' needs for location information to provide a more precise and effective service [1], [2]. There are various wireless techniques and mechanisms that have been proposed; some of them have been studied in depth and come into use, such as Wi-Fi, RFID, and sensor networks. In comparison, the development of Bluetooth location technology is slow and there are not many papers and surveys in this field, although the performance and market value of Bluetooth are increasing steadily. In this paper, we aim to provide a detailed survey of various indoor localization systems with Bluetooth. In contrast with the existing surveys, we categorize the exciting localization techniques that have been proposed in the literature in order to sketch the development of Bluetooth location compared to other technologies. We also evaluate different systems from the perspective of availability, cost, scalability, and accuracy. We also discuss remaining problems and challenges to accurate Bluetooth localization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12529v1-abstract-full').style.display = 'none'; document.getElementById('2404.12529v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09520">arXiv:2404.09520</a> <span> [<a href="https://arxiv.org/pdf/2404.09520">pdf</a>, <a href="https://arxiv.org/format/2404.09520">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> UniSAR: Modeling User Transition Behaviors between Search and Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Teng Shi</a>, <a href="/search/cs?searchtype=author&query=Si%2C+Z">Zihua Si</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+X">Xiaoxue Zang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kai Zheng</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+D">Dewei Leng</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Y">Yanan Niu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yang Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09520v1-abstract-short" style="display: inline;"> Nowadays, many platforms provide users with both search and recommendation services as important tools for accessing information. The phenomenon has led to a correlation between user search and recommendation behaviors, providing an opportunity to model user interests in a fine-grained way. Existing approaches either model user search and recommendation behaviors separately or overlook the differe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09520v1-abstract-full').style.display = 'inline'; document.getElementById('2404.09520v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09520v1-abstract-full" style="display: none;"> Nowadays, many platforms provide users with both search and recommendation services as important tools for accessing information. The phenomenon has led to a correlation between user search and recommendation behaviors, providing an opportunity to model user interests in a fine-grained way. Existing approaches either model user search and recommendation behaviors separately or overlook the different transitions between user search and recommendation behaviors. In this paper, we propose a framework named UniSAR that effectively models the different types of fine-grained behavior transitions for providing users a Unified Search And Recommendation service. Specifically, UniSAR models the user transition behaviors between search and recommendation through three steps: extraction, alignment, and fusion, which are respectively implemented by transformers equipped with pre-defined masks, contrastive learning that aligns the extracted fine-grained user transitions, and cross-attentions that fuse different transitions. To provide users with a unified service, the learned representations are fed into the downstream search and recommendation models. Joint learning on both search and recommendation data is employed to utilize the knowledge and enhance each other. Experimental results on two public datasets demonstrated the effectiveness of UniSAR in terms of enhancing both search and recommendation simultaneously. The experimental analysis further validates that UniSAR enhances the results by successfully modeling the user transition behaviors between search and recommendation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09520v1-abstract-full').style.display = 'none'; document.getElementById('2404.09520v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGIR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08226">arXiv:2404.08226</a> <span> [<a href="https://arxiv.org/pdf/2404.08226">pdf</a>, <a href="https://arxiv.org/format/2404.08226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving Continuous Sign Language Recognition with Adapted Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lianyu Hu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tongkai Shi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Liqing Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zekang Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wei Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08226v1-abstract-short" style="display: inline;"> The increase of web-scale weakly labelled image-text pairs have greatly facilitated the development of large-scale vision-language models (e.g., CLIP), which have shown impressive generalization performance over a series of downstream tasks. However, the massive model size and scarcity of available data limit their applications to fine-tune the whole model in downstream tasks. Besides, fully fine-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08226v1-abstract-full').style.display = 'inline'; document.getElementById('2404.08226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08226v1-abstract-full" style="display: none;"> The increase of web-scale weakly labelled image-text pairs have greatly facilitated the development of large-scale vision-language models (e.g., CLIP), which have shown impressive generalization performance over a series of downstream tasks. However, the massive model size and scarcity of available data limit their applications to fine-tune the whole model in downstream tasks. Besides, fully fine-tuning the model easily forgets the generic essential knowledge acquired in the pretraining stage and overfits the downstream data. To enable high efficiency when adapting these large vision-language models (e.g., CLIP) to performing continuous sign language recognition (CSLR) while preserving their generalizability, we propose a novel strategy (AdaptSign). Especially, CLIP is adopted as the visual backbone to extract frame-wise features whose parameters are fixed, and a set of learnable modules are introduced to model spatial sign variations or capture temporal sign movements. The introduced additional modules are quite lightweight, only owning 3.2% extra computations with high efficiency. The generic knowledge acquired in the pretraining stage is well-preserved in the frozen CLIP backbone in this process. Extensive experiments show that despite being efficient, AdaptSign is able to demonstrate superior performance across a series of CSLR benchmarks including PHOENIX14, PHOENIX14-T, CSL-Daily and CSL compared to existing methods. Visualizations show that AdaptSign could learn to dynamically pay major attention to the informative spatial regions and cross-frame trajectories in sign videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08226v1-abstract-full').style.display = 'none'; document.getElementById('2404.08226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.05680">arXiv:2404.05680</a> <span> [<a href="https://arxiv.org/pdf/2404.05680">pdf</a>, <a href="https://arxiv.org/format/2404.05680">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Heyuan Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Ce Chen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianhao Shi</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+Y">Yuda Qiu</a>, <a href="/search/cs?searchtype=author&query=An%2C+S">Sizhe An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanying Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoguang Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.05680v2-abstract-short" style="display: inline;"> While recent advances in 3D-aware Generative Adversarial Networks (GANs) have aided the development of near-frontal view human face synthesis, the challenge of comprehensively synthesizing a full 3D head viewable from all angles still persists. Although PanoHead proves the possibilities of using a large-scale dataset with images of both frontal and back views for full-head synthesis, it often caus… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05680v2-abstract-full').style.display = 'inline'; document.getElementById('2404.05680v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.05680v2-abstract-full" style="display: none;"> While recent advances in 3D-aware Generative Adversarial Networks (GANs) have aided the development of near-frontal view human face synthesis, the challenge of comprehensively synthesizing a full 3D head viewable from all angles still persists. Although PanoHead proves the possibilities of using a large-scale dataset with images of both frontal and back views for full-head synthesis, it often causes artifacts for back views. Based on our in-depth analysis, we found the reasons are mainly twofold. First, from network architecture perspective, we found each plane in the utilized tri-plane/tri-grid representation space tends to confuse the features from both sides, causing "mirroring" artifacts (e.g., the glasses appear in the back). Second, from data supervision aspect, we found that existing discriminator training in 3D GANs mainly focuses on the quality of the rendered image itself, and does not care much about its plausibility with the perspective from which it was rendered. This makes it possible to generate "face" in non-frontal views, due to its easiness to fool the discriminator. In response, we propose SphereHead, a novel tri-plane representation in the spherical coordinate system that fits the human head's geometric characteristics and efficiently mitigates many of the generated artifacts. We further introduce a view-image consistency loss for the discriminator to emphasize the correspondence of the camera parameters and the images. The combination of these efforts results in visually superior outcomes with significantly fewer artifacts. Our code and dataset are publicly available at https://lhyfst.github.io/spherehead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05680v2-abstract-full').style.display = 'none'; document.getElementById('2404.05680v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024. Project page: https://lhyfst.github.io/spherehead</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.02082">arXiv:2404.02082</a> <span> [<a href="https://arxiv.org/pdf/2404.02082">pdf</a>, <a href="https://arxiv.org/format/2404.02082">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> WcDT: World-centric Diffusion Transformer for Traffic Scene Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chen Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+A+X">Aaron Xuxiang Tian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dong Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianhui Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Heydarian%2C+A">Arsalan Heydarian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.02082v3-abstract-short" style="display: inline;"> In this paper, we introduce a novel approach for autonomous driving trajectory generation by harnessing the complementary strengths of diffusion probabilistic models (a.k.a., diffusion models) and transformers. Our proposed framework, termed the "World-Centric Diffusion Transformer"(WcDT), optimizes the entire trajectory generation process, from feature extraction to model inference. To enhance th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02082v3-abstract-full').style.display = 'inline'; document.getElementById('2404.02082v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.02082v3-abstract-full" style="display: none;"> In this paper, we introduce a novel approach for autonomous driving trajectory generation by harnessing the complementary strengths of diffusion probabilistic models (a.k.a., diffusion models) and transformers. Our proposed framework, termed the "World-Centric Diffusion Transformer"(WcDT), optimizes the entire trajectory generation process, from feature extraction to model inference. To enhance the scene diversity and stochasticity, the historical trajectory data is first preprocessed into "Agent Move Statement" and encoded into latent space using Denoising Diffusion Probabilistic Models (DDPM) enhanced with Diffusion with Transformer (DiT) blocks. Then, the latent features, historical trajectories, HD map features, and historical traffic signal information are fused with various transformer-based encoders that are used to enhance the interaction of agents with other elements in the traffic scene. The encoded traffic scenes are then decoded by a trajectory decoder to generate multimodal future trajectories. Comprehensive experimental results show that the proposed approach exhibits superior performance in generating both realistic and diverse trajectories, showing its potential for integration into automatic driving simulation systems. Our code is available at \url{https://github.com/yangchen1997/WcDT}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02082v3-abstract-full').style.display = 'none'; document.getElementById('2404.02082v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01663">arXiv:2404.01663</a> <span> [<a href="https://arxiv.org/pdf/2404.01663">pdf</a>, <a href="https://arxiv.org/format/2404.01663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CMAT: A Multi-Agent Collaboration Tuning Framework for Enhancing Small Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xuechen Liang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+M">Meiling Tao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yinghui Xia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">JingSong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01663v4-abstract-short" style="display: inline;"> Open large language models (LLMs) have significantly advanced the field of natural language processing, showcasing impressive performance across various tasks.Despite the significant advancements in LLMs, their effective operation still relies heavily on human input to accurately guide the dialogue flow, with agent tuning being a crucial optimization technique that involves human adjustments to th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01663v4-abstract-full').style.display = 'inline'; document.getElementById('2404.01663v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01663v4-abstract-full" style="display: none;"> Open large language models (LLMs) have significantly advanced the field of natural language processing, showcasing impressive performance across various tasks.Despite the significant advancements in LLMs, their effective operation still relies heavily on human input to accurately guide the dialogue flow, with agent tuning being a crucial optimization technique that involves human adjustments to the model for better response to such guidance.Addressing this dependency, our work introduces the TinyAgent model, trained on a meticulously curated high-quality dataset. We also present the Collaborative Multi-Agent Tuning (CMAT) framework, an innovative system designed to augment language agent capabilities through adaptive weight updates based on environmental feedback. This framework fosters collaborative learning and real-time adaptation among multiple intelligent agents, enhancing their context-awareness and long-term memory. In this research, we propose a new communication agent framework that integrates multi-agent systems with environmental feedback mechanisms, offering a scalable method to explore cooperative behaviors. Notably, our TinyAgent-7B model exhibits performance on par with GPT-3.5, despite having fewer parameters, signifying a substantial improvement in the efficiency and effectiveness of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01663v4-abstract-full').style.display = 'none'; document.getElementById('2404.01663v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11725">arXiv:2402.11725</a> <span> [<a href="https://arxiv.org/pdf/2402.11725">pdf</a>, <a href="https://arxiv.org/format/2402.11725">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> How Susceptible are Large Language Models to Ideological Manipulation? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zihao He</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jun Yan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Taiwei Shi</a>, <a href="/search/cs?searchtype=author&query=Lerman%2C+K">Kristina Lerman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11725v3-abstract-short" style="display: inline;"> Large Language Models (LLMs) possess the potential to exert substantial influence on public perceptions and interactions with information. This raises concerns about the societal impact that could arise if the ideologies within these models can be easily manipulated. In this work, we investigate how effectively LLMs can learn and generalize ideological biases from their instruction-tuning data. Ou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11725v3-abstract-full').style.display = 'inline'; document.getElementById('2402.11725v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11725v3-abstract-full" style="display: none;"> Large Language Models (LLMs) possess the potential to exert substantial influence on public perceptions and interactions with information. This raises concerns about the societal impact that could arise if the ideologies within these models can be easily manipulated. In this work, we investigate how effectively LLMs can learn and generalize ideological biases from their instruction-tuning data. Our findings reveal a concerning vulnerability: exposure to only a small amount of ideologically driven samples significantly alters the ideology of LLMs. Notably, LLMs demonstrate a startling ability to absorb ideology from one topic and generalize it to even unrelated ones. The ease with which LLMs' ideologies can be skewed underscores the risks associated with intentionally poisoned training data by malicious actors or inadvertently introduced biases by data annotators. It also emphasizes the imperative for robust safeguards to mitigate the influence of ideological manipulations on LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11725v3-abstract-full').style.display = 'none'; document.getElementById('2402.11725v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.16501">arXiv:2401.16501</a> <span> [<a href="https://arxiv.org/pdf/2401.16501">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AFSD-Physics: Exploring the governing equations of temperature evolution during additive friction stir deposition by a human-AI teaming approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tony Shi</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+M">Mason Ma</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiajie Wu</a>, <a href="/search/cs?searchtype=author&query=Post%2C+C">Chase Post</a>, <a href="/search/cs?searchtype=author&query=Charles%2C+E">Elijah Charles</a>, <a href="/search/cs?searchtype=author&query=Schmitz%2C+T">Tony Schmitz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.16501v1-abstract-short" style="display: inline;"> This paper presents a modeling effort to explore the underlying physics of temperature evolution during additive friction stir deposition (AFSD) by a human-AI teaming approach. AFSD is an emerging solid-state additive manufacturing technology that deposits materials without melting. However, both process modeling and modeling of the AFSD tool are at an early stage. In this paper, a human-AI teamin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16501v1-abstract-full').style.display = 'inline'; document.getElementById('2401.16501v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.16501v1-abstract-full" style="display: none;"> This paper presents a modeling effort to explore the underlying physics of temperature evolution during additive friction stir deposition (AFSD) by a human-AI teaming approach. AFSD is an emerging solid-state additive manufacturing technology that deposits materials without melting. However, both process modeling and modeling of the AFSD tool are at an early stage. In this paper, a human-AI teaming approach is proposed to combine models based on first principles with AI. The resulting human-informed machine learning method, denoted as AFSD-Physics, can effectively learn the governing equations of temperature evolution at the tool and the build from in-process measurements. Experiments are designed and conducted to collect in-process measurements for the deposition of aluminum 7075 with a total of 30 layers. The acquired governing equations are physically interpretable models with low computational cost and high accuracy. Model predictions show good agreement with the measurements. Experimental validation with new process parameters demonstrates the model's generalizability and potential for use in tool temperature control and process optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16501v1-abstract-full').style.display = 'none'; document.getElementById('2401.16501v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.09432">arXiv:2401.09432</a> <span> [<a href="https://arxiv.org/pdf/2401.09432">pdf</a>, <a href="https://arxiv.org/format/2401.09432">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RoleCraft-GLM: Advancing Personalized Role-Playing in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+M">Meiling Tao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xuechen Liang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lei Yu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yiting Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.09432v2-abstract-short" style="display: inline;"> This study presents RoleCraft-GLM, an innovative framework aimed at enhancing personalized role-playing with Large Language Models (LLMs). RoleCraft-GLM addresses the key issue of lacking personalized interactions in conversational AI, and offers a solution with detailed and emotionally nuanced character portrayals. We contribute a unique conversational dataset that shifts from conventional celebr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09432v2-abstract-full').style.display = 'inline'; document.getElementById('2401.09432v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.09432v2-abstract-full" style="display: none;"> This study presents RoleCraft-GLM, an innovative framework aimed at enhancing personalized role-playing with Large Language Models (LLMs). RoleCraft-GLM addresses the key issue of lacking personalized interactions in conversational AI, and offers a solution with detailed and emotionally nuanced character portrayals. We contribute a unique conversational dataset that shifts from conventional celebrity-centric characters to diverse, non-celebrity personas, thus enhancing the realism and complexity of language modeling interactions. Additionally, our approach includes meticulous character development, ensuring dialogues are both realistic and emotionally resonant. The effectiveness of RoleCraft-GLM is validated through various case studies, highlighting its versatility and skill in different scenarios. Our framework excels in generating dialogues that accurately reflect characters' personality traits and emotions, thereby boosting user engagement. In conclusion, RoleCraft-GLM marks a significant leap in personalized AI interactions, and paves the way for more authentic and immersive AI-assisted role-playing experiences by enabling more nuanced and emotionally rich dialogues <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09432v2-abstract-full').style.display = 'none'; document.getElementById('2401.09432v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00625">arXiv:2401.00625</a> <span> [<a href="https://arxiv.org/pdf/2401.00625">pdf</a>, <a href="https://arxiv.org/ps/2401.00625">ps</a>, <a href="https://arxiv.org/format/2401.00625">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Efficiency: A Systematic Survey of Resource-Efficient Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+G">Guangji Bai</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+Z">Zheng Chai</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+C">Chen Ling</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiaying Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Nan Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tingwei Shi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Ziyang Yu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Mengdan Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Carl Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yue Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Liang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00625v3-abstract-short" style="display: inline;"> The burgeoning field of Large Language Models (LLMs), exemplified by sophisticated models like OpenAI's ChatGPT, represents a significant advancement in artificial intelligence. These models, however, bring forth substantial challenges in the high consumption of computational, memory, energy, and financial resources, especially in environments with limited resource capabilities. This survey aims t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00625v3-abstract-full').style.display = 'inline'; document.getElementById('2401.00625v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00625v3-abstract-full" style="display: none;"> The burgeoning field of Large Language Models (LLMs), exemplified by sophisticated models like OpenAI's ChatGPT, represents a significant advancement in artificial intelligence. These models, however, bring forth substantial challenges in the high consumption of computational, memory, energy, and financial resources, especially in environments with limited resource capabilities. This survey aims to systematically address these challenges by reviewing a broad spectrum of techniques designed to enhance the resource efficiency of LLMs. We categorize methods based on their optimization focus: computational, memory, energy, financial, and network resources and their applicability across various stages of an LLM's lifecycle, including architecture design, pretraining, finetuning, and system design. Additionally, the survey introduces a nuanced categorization of resource efficiency techniques by their specific resource types, which uncovers the intricate relationships and mappings between various resources and corresponding optimization techniques. A standardized set of evaluation metrics and datasets is also presented to facilitate consistent and fair comparisons across different models and techniques. By offering a comprehensive overview of the current sota and identifying open research avenues, this survey serves as a foundational reference for researchers and practitioners, aiding them in developing more sustainable and efficient LLMs in a rapidly evolving landscape. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00625v3-abstract-full').style.display = 'none'; document.getElementById('2401.00625v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">GitHub repo: https://github.com/tiingweii-shii/Awesome-Resource-Efficient-LLM-Papers</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15599">arXiv:2312.15599</a> <span> [<a href="https://arxiv.org/pdf/2312.15599">pdf</a>, <a href="https://arxiv.org/format/2312.15599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Preliminary Study on Incremental Learning for Large Language Model-based Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianhao Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhijian Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chong Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Fuli Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qi Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15599v2-abstract-short" style="display: inline;"> Adapting Large Language Models for Recommendation (LLM4Rec) has shown promising results. However, the challenges of deploying LLM4Rec in real-world scenarios remain largely unexplored. In particular, recommender models need incremental adaptation to evolving user preferences, while the suitability of traditional incremental learning methods within LLM4Rec remains ambiguous due to the unique charac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15599v2-abstract-full').style.display = 'inline'; document.getElementById('2312.15599v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15599v2-abstract-full" style="display: none;"> Adapting Large Language Models for Recommendation (LLM4Rec) has shown promising results. However, the challenges of deploying LLM4Rec in real-world scenarios remain largely unexplored. In particular, recommender models need incremental adaptation to evolving user preferences, while the suitability of traditional incremental learning methods within LLM4Rec remains ambiguous due to the unique characteristics of Large Language Models (LLMs). In this study, we empirically evaluate two commonly employed incremental learning strategies (full retraining and fine-tuning) for LLM4Rec. Surprisingly, neither approach shows significant improvements in the performance of LLM4Rec. Instead of dismissing the role of incremental learning, we attribute the lack of anticipated performance enhancement to a mismatch between the LLM4Rec architecture and incremental learning: LLM4Rec employs a single adaptation module for learning recommendations, limiting its ability to simultaneously capture long-term and short-term user preferences in the incremental learning context. To test this speculation, we introduce a Long- and Short-term Adaptation-aware Tuning (LSAT) framework for incremental learning in LLM4Rec. Unlike the single adaptation module approach, LSAT utilizes two distinct adaptation modules to independently learn long-term and short-term user preferences. Empirical results verify that LSAT enhances performance, thereby validating our speculation. We release our code at: https://github.com/TianhaoShi2001/LSAT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15599v2-abstract-full').style.display = 'none'; document.getElementById('2312.15599v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted in the short paper track of the 2024 ACM International Conference on Information and Knowledge Management (CIKM 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.15920">arXiv:2311.15920</a> <span> [<a href="https://arxiv.org/pdf/2311.15920">pdf</a>, <a href="https://arxiv.org/format/2311.15920">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Fully Data-Driven Approach for Realistic Traffic Signal Control Using Offline Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianxiong Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shichao Lin</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianyu Shi</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Chujie Tian</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+Y">Yu Mei</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jian Song</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+X">Xianyuan Zhan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruimin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.15920v1-abstract-short" style="display: inline;"> The optimization of traffic signal control (TSC) is critical for an efficient transportation system. In recent years, reinforcement learning (RL) techniques have emerged as a popular approach for TSC and show promising results for highly adaptive control. However, existing RL-based methods suffer from notably poor real-world applicability and hardly have any successful deployments. The reasons for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15920v1-abstract-full').style.display = 'inline'; document.getElementById('2311.15920v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.15920v1-abstract-full" style="display: none;"> The optimization of traffic signal control (TSC) is critical for an efficient transportation system. In recent years, reinforcement learning (RL) techniques have emerged as a popular approach for TSC and show promising results for highly adaptive control. However, existing RL-based methods suffer from notably poor real-world applicability and hardly have any successful deployments. The reasons for such failures are mostly due to the reliance on over-idealized traffic simulators for policy optimization, as well as using unrealistic fine-grained state observations and reward signals that are not directly obtainable from real-world sensors. In this paper, we propose a fully Data-Driven and simulator-free framework for realistic Traffic Signal Control (D2TSC). Specifically, we combine well-established traffic flow theory with machine learning to construct a reward inference model to infer the reward signals from coarse-grained traffic data. With the inferred rewards, we further propose a sample-efficient offline RL method to enable direct signal control policy learning from historical offline datasets of real-world intersections. To evaluate our approach, we collect historical traffic data from a real-world intersection, and develop a highly customized simulation environment that strictly follows real data characteristics. We demonstrate through extensive experiments that our approach achieves superior performance over conventional and offline RL baselines, and also enjoys much better real-world applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15920v1-abstract-full').style.display = 'none'; document.getElementById('2311.15920v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.10781">arXiv:2311.10781</a> <span> [<a href="https://arxiv.org/pdf/2311.10781">pdf</a>, <a href="https://arxiv.org/format/2311.10781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can Language Model Moderators Improve the Health of Online Discourse? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cho%2C+H">Hyundong Cho</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuai Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Taiwei Shi</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+D">Darpan Jain</a>, <a href="/search/cs?searchtype=author&query=Rizk%2C+B">Basem Rizk</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuyang Huang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zixun Lu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+N">Nuan Wen</a>, <a href="/search/cs?searchtype=author&query=Gratch%2C+J">Jonathan Gratch</a>, <a href="/search/cs?searchtype=author&query=Ferrara%2C+E">Emilio Ferrara</a>, <a href="/search/cs?searchtype=author&query=May%2C+J">Jonathan May</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.10781v2-abstract-short" style="display: inline;"> Conversational moderation of online communities is crucial to maintaining civility for a constructive environment, but it is challenging to scale and harmful to moderators. The inclusion of sophisticated natural language generation modules as a force multiplier to aid human moderators is a tantalizing prospect, but adequate evaluation approaches have so far been elusive. In this paper, we establis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10781v2-abstract-full').style.display = 'inline'; document.getElementById('2311.10781v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.10781v2-abstract-full" style="display: none;"> Conversational moderation of online communities is crucial to maintaining civility for a constructive environment, but it is challenging to scale and harmful to moderators. The inclusion of sophisticated natural language generation modules as a force multiplier to aid human moderators is a tantalizing prospect, but adequate evaluation approaches have so far been elusive. In this paper, we establish a systematic definition of conversational moderation effectiveness grounded on moderation literature and establish design criteria for conducting realistic yet safe evaluation. We then propose a comprehensive evaluation framework to assess models' moderation capabilities independently of human intervention. With our framework, we conduct the first known study of language models as conversational moderators, finding that appropriately prompted models that incorporate insights from social science can provide specific and fair feedback on toxic behavior but struggle to influence users to increase their levels of respect and cooperation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10781v2-abstract-full').style.display = 'none'; document.getElementById('2311.10781v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, NAACL 2024 Main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.09632">arXiv:2311.09632</a> <span> [<a href="https://arxiv.org/pdf/2311.09632">pdf</a>, <a href="https://arxiv.org/format/2311.09632">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Online Continual Knowledge Learning for Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuhao Wu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tongjun Shi</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+K">Karthick Sharma</a>, <a href="/search/cs?searchtype=author&query=Seah%2C+C+W">Chun Wei Seah</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuhao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.09632v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) serve as repositories of extensive world knowledge, enabling them to perform tasks such as question-answering and fact-checking. However, this knowledge can become obsolete as global contexts change. In this paper, we introduce a novel problem in the realm of continual learning: Online Continual Knowledge Learning (OCKL). This problem formulation aims to manage the dyn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09632v1-abstract-full').style.display = 'inline'; document.getElementById('2311.09632v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.09632v1-abstract-full" style="display: none;"> Large Language Models (LLMs) serve as repositories of extensive world knowledge, enabling them to perform tasks such as question-answering and fact-checking. However, this knowledge can become obsolete as global contexts change. In this paper, we introduce a novel problem in the realm of continual learning: Online Continual Knowledge Learning (OCKL). This problem formulation aims to manage the dynamic nature of world knowledge in LMs under real-time constraints. We propose a new benchmark and evaluation metric designed to measure both the rate of new knowledge acquisition and the retention of previously learned knowledge. Our empirical evaluation, conducted using a variety of state-of-the-art methods, establishes robust base-lines for OCKL. Our results reveal that existing continual learning approaches are unfortunately insufficient for tackling the unique challenges posed by OCKL. We identify key factors that influence the trade-off between knowledge acquisition and retention, thereby advancing our understanding of how to train LMs in a continually evolving environment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09632v1-abstract-full').style.display = 'none'; document.getElementById('2311.09632v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.08685">arXiv:2311.08685</a> <span> [<a href="https://arxiv.org/pdf/2311.08685">pdf</a>, <a href="https://arxiv.org/format/2311.08685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Safer-Instruct: Aligning Language Models with Automated Preference Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+T">Taiwei Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jieyu Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.08685v3-abstract-short" style="display: inline;"> Reinforcement learning from human feedback (RLHF) is a vital strategy for enhancing model capability in language models. However, annotating preference data for RLHF is a resource-intensive and creativity-demanding process, while existing automatic generation methods face limitations in data diversity and quality. In response, we present Safer-Instruct, a novel pipeline for automatically construct… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08685v3-abstract-full').style.display = 'inline'; document.getElementById('2311.08685v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.08685v3-abstract-full" style="display: none;"> Reinforcement learning from human feedback (RLHF) is a vital strategy for enhancing model capability in language models. However, annotating preference data for RLHF is a resource-intensive and creativity-demanding process, while existing automatic generation methods face limitations in data diversity and quality. In response, we present Safer-Instruct, a novel pipeline for automatically constructing large-scale preference data. Our approach leverages reversed instruction tuning, instruction induction, and expert model evaluation to efficiently generate high-quality preference data without human annotators. To verify the effectiveness of Safer-Instruct, we apply the pipeline to construct a safety preference dataset as a case study. Finetuning an Alpaca model on this synthetic dataset not only demonstrates improved harmlessness but also outperforms models fine-tuned on human-annotated safety preference data, all the while maintaining a competitive edge in downstream tasks. Importantly, our Safer-Instruct framework is versatile and can be applied to generate preference data across various domains, extending its utility beyond safety preferences. It addresses the challenges in preference data acquisition and advances the development of more capable and responsible AI systems. For dataset and code implementation, see https://github.com/uscnlp-lime/safer-instruct <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08685v3-abstract-full').style.display = 'none'; document.getElementById('2311.08685v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages. NAACL 2024 Camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.07613">arXiv:2311.07613</a> <span> [<a href="https://arxiv.org/pdf/2311.07613">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> </div> </div> <p class="title is-5 mathjax"> A Physics-informed Machine Learning-based Control Method for Nonlinear Dynamic Systems with Highly Noisy Measurements </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+M">Mason Ma</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiajie Wu</a>, <a href="/search/cs?searchtype=author&query=Post%2C+C">Chase Post</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tony Shi</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+J">Jingang Yi</a>, <a href="/search/cs?searchtype=author&query=Schmitz%2C+T">Tony Schmitz</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.07613v1-abstract-short" style="display: inline;"> This study presents a physics-informed machine learning-based control method for nonlinear dynamic systems with highly noisy measurements. Existing data-driven control methods that use machine learning for system identification cannot effectively cope with highly noisy measurements, resulting in unstable control performance. To address this challenge, the present study extends current physics-info… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07613v1-abstract-full').style.display = 'inline'; document.getElementById('2311.07613v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.07613v1-abstract-full" style="display: none;"> This study presents a physics-informed machine learning-based control method for nonlinear dynamic systems with highly noisy measurements. Existing data-driven control methods that use machine learning for system identification cannot effectively cope with highly noisy measurements, resulting in unstable control performance. To address this challenge, the present study extends current physics-informed machine learning capabilities for modeling nonlinear dynamics with control and integrates them into a model predictive control framework. To demonstrate the capability of the proposed method we test and validate with two noisy nonlinear dynamic systems: the chaotic Lorenz 3 system, and turning machine tool. Analysis of the results illustrate that the proposed method outperforms state-of-the-art benchmarks as measured by both modeling accuracy and control performance for nonlinear dynamic systems under high-noise conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07613v1-abstract-full').style.display = 'none'; document.getElementById('2311.07613v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.00706">arXiv:2311.00706</a> <span> [<a href="https://arxiv.org/pdf/2311.00706">pdf</a>, <a href="https://arxiv.org/format/2311.00706">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can AI Mitigate Human Perceptual Biases? A Pilot Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Geuy%2C+R">Ross Geuy</a>, <a href="/search/cs?searchtype=author&query=Rising%2C+N">Nate Rising</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tiancheng Shi</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+M">Meng Ling</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.00706v1-abstract-short" style="display: inline;"> We present results from a pilot experiment to measure if machine recommendations can debias human perceptual biases in visualization tasks. We specifically studied the ``pull-down'' effect, i.e., people underestimate the average position of lines, for the task of estimating the ensemble average of data points in line charts. These line charts can show for example temperature or precipitation in 12… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00706v1-abstract-full').style.display = 'inline'; document.getElementById('2311.00706v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.00706v1-abstract-full" style="display: none;"> We present results from a pilot experiment to measure if machine recommendations can debias human perceptual biases in visualization tasks. We specifically studied the ``pull-down'' effect, i.e., people underestimate the average position of lines, for the task of estimating the ensemble average of data points in line charts. These line charts can show for example temperature or precipitation in 12 months. Six participants estimated ensemble averages with or without an AI assistant. The assistant, when available, responded at three different speeds to assemble the conditions of a human collaborator who may delay his or her responses. Our pilot study showed that participants were faster with AI assistance in ensemble tasks, compared to the baseline without AI assistance. Although ``pull-down'' biases were reduced, the effect of AI assistance was not statistically significant. Also, delaying AI responses had no significant impact on human decision accuracy. We discuss the implications of these preliminary results for subsequent studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00706v1-abstract-full').style.display = 'none'; document.getElementById('2311.00706v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted IEEE VIS 2023 VISxVISION Workshop</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shi%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shi%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+T&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository