Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 182 results for author: <span class="mathjax">Xiong, W</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Xiong%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Xiong, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xiong%2C+W&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xiong, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xiong%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xiong%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xiong%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xiong%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xiong%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12073">arXiv:2502.12073</a> <span> [<a href="https://arxiv.org/pdf/2502.12073">pdf</a>, <a href="https://arxiv.org/format/2502.12073">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can LLMs Simulate Social Media Engagement? A Study on Action-Guided Response Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+Z">Zhongyi Qiu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+H">Hanjia Lyu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12073v1-abstract-short" style="display: inline;"> Social media enables dynamic user engagement with trending topics, and recent research has explored the potential of large language models (LLMs) for response generation. 
While some studies investigate LLMs as agents for simulating user behavior on social media, their focus remains on practical viability and scalability rather than a deeper understanding of how well LLM aligns with human behavior.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12073v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12073v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12073v1-abstract-full" style="display: none;"> Social media enables dynamic user engagement with trending topics, and recent research has explored the potential of large language models (LLMs) for response generation. While some studies investigate LLMs as agents for simulating user behavior on social media, their focus remains on practical viability and scalability rather than a deeper understanding of how well LLM aligns with human behavior. This paper analyzes LLMs' ability to simulate social media engagement through action guided response generation, where a model first predicts a user's most likely engagement action-retweet, quote, or rewrite-towards a trending post before generating a personalized response conditioned on the predicted action. We benchmark GPT-4o-mini, O1-mini, and DeepSeek-R1 in social media engagement simulation regarding a major societal event discussed on X. Our findings reveal that zero-shot LLMs underperform BERT in action prediction, while few-shot prompting initially degrades the prediction accuracy of LLMs with limited examples. However, in response generation, few-shot LLMs achieve stronger semantic alignment with ground truth posts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12073v1-abstract-full').style.display = 'none'; document.getElementById('2502.12073v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09874">arXiv:2502.09874</a> <span> [<a href="https://arxiv.org/pdf/2502.09874">pdf</a>, <a href="https://arxiv.org/format/2502.09874">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FrGNet: A fourier-guided weakly-supervised framework for nuclear instance segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+P">Peng Ling</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wenxiao Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09874v2-abstract-short" style="display: inline;"> Nuclear instance segmentation has played a critical role in pathology image analysis. 
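The two-stage, action-guided setup described in this abstract is straightforward to prototype. Below is a minimal sketch, assuming a placeholder `complete(prompt)` helper that stands in for whatever chat-completion API is available; the prompts and the helper are illustrative, not the authors' code.

```python
# Minimal sketch of action-guided response generation (illustrative only;
# `complete` is a placeholder for any chat-completion API, not the paper's code).

ACTIONS = ("retweet", "quote", "rewrite")

def complete(prompt: str) -> str:
    """Placeholder for an LLM call (e.g., GPT-4o-mini via an API client)."""
    raise NotImplementedError

def predict_action(user_profile: str, trending_post: str) -> str:
    # Stage 1: predict the user's most likely engagement action.
    prompt = (
        f"User profile:\n{user_profile}\n\n"
        f"Trending post:\n{trending_post}\n\n"
        f"Which action is this user most likely to take: {', '.join(ACTIONS)}? "
        "Answer with one word."
    )
    answer = complete(prompt).strip().lower()
    return answer if answer in ACTIONS else "retweet"  # fall back to the most common action

def generate_response(user_profile: str, trending_post: str) -> str:
    # Stage 2: generate a personalized response conditioned on the predicted action.
    action = predict_action(user_profile, trending_post)
    if action == "retweet":
        return trending_post  # a plain retweet reproduces the post
    prompt = (
        f"User profile:\n{user_profile}\n\n"
        f"Trending post:\n{trending_post}\n\n"
        f"Write the text this user would post when they {action} it."
    )
    return complete(prompt)
```

Separating the stages this way is what allows action prediction and response generation to be scored independently, as the benchmark in the paper does.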
2. arXiv:2502.09874 [cs.CV, cs.AI]
FrGNet: A Fourier-Guided Weakly-Supervised Framework for Nuclear Instance Segmentation
Authors: Peng Ling, Wenxiao Xiong
Abstract: Nuclear instance segmentation plays a critical role in pathology image analysis. The main challenges arise from the difficulty of accurately segmenting instances and the high cost of the precise mask-level annotations required for fully-supervised training. In this work, we propose a Fourier guidance framework for the weakly-supervised nuclear instance segmentation problem. In this framework, we construct a Fourier guidance module that fuses prior information into the training process, helping the model capture the relevant features of nuclei. Meanwhile, to further improve the model's ability to represent nuclear features, we propose a guide-based instance-level contrastive module, which makes full use of the framework's own properties and the guide information to strengthen the learned representations. On two public datasets, our model outperforms current SOTA methods under the fully-supervised design, and in weakly-supervised experiments it maintains performance close to full supervision with only a small amount of labeling. In addition, in generalization experiments on a private dataset, our model segments nuclear images unseen during training quite effectively without any labeling. As open science, all code and pre-trained models are available at https://github.com/LQY404/FrGNet.
Submitted 18 February, 2025; v1 submitted 13 February, 2025; originally announced February 2025.
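As a rough intuition for what a Fourier-domain prior can contribute (a toy illustration, not FrGNet's actual guidance module), one can turn the low-frequency content of an image into a soft foreground mask: nuclei are roughly blob-like, so a low-pass reconstruction tends to highlight them while suppressing fine texture.

```python
import numpy as np

def fourier_prior_mask(image: np.ndarray, keep_ratio: float = 0.1) -> np.ndarray:
    """Toy Fourier-domain prior: low-pass filter a grayscale image and
    normalize the result to [0, 1] as a soft mask.
    Illustrative only; not the module proposed in the paper."""
    f = np.fft.fftshift(np.fft.fft2(image))
    h, w = image.shape
    cy, cx = h // 2, w // 2
    ry, rx = int(h * keep_ratio), int(w * keep_ratio)
    mask = np.zeros_like(f)
    mask[cy - ry:cy + ry, cx - rx:cx + rx] = 1  # keep only low frequencies
    low = np.abs(np.fft.ifft2(np.fft.ifftshift(f * mask)))
    return (low - low.min()) / (low.max() - low.min() + 1e-8)
```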
3. arXiv:2502.08664 [cs.RO, cs.AI]
Motion Forecasting for Autonomous Vehicles: A Survey
Authors: Jianxin Shi, Jinhao Chen, Yuandong Wang, Li Sun, Chunyang Liu, Wei Xiong, Tianyu Wo
Abstract: In recent years, the field of autonomous driving has attracted increasing public interest. Accurately forecasting the future behavior of various traffic participants is essential for the decision-making of Autonomous Vehicles (AVs). In this paper, we focus on both scenario-based and perception-based motion forecasting for AVs. We propose a formal problem formulation for motion forecasting and summarize the main challenges confronting this area of research. We also detail representative datasets and evaluation metrics pertinent to this field. Furthermore, this study classifies recent research into two main categories, supervised learning and self-supervised learning, reflecting the evolving paradigms in both scenario-based and perception-based motion forecasting. In the context of supervised learning, we thoroughly examine and analyze each key element of the methodology. For self-supervised learning, we summarize commonly adopted techniques. The paper concludes by discussing potential research directions, aiming to propel progress in this vital area of AV technology.
Submitted 10 February, 2025; originally announced February 2025.
Comments: 31 pages, 7 figures
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08664v1-abstract-full').style.display = 'none'; document.getElementById('2502.08664v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07460">arXiv:2502.07460</a> <span> [<a href="https://arxiv.org/pdf/2502.07460">pdf</a>, <a href="https://arxiv.org/ps/2502.07460">ps</a>, <a href="https://arxiv.org/format/2502.07460">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Logarithmic Regret for Online KL-Regularized Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Heyang Zhao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+C">Chenlu Ye</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Q">Quanquan Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07460v2-abstract-short" style="display: inline;"> Recent advances in Reinforcement Learning from Human Feedback (RLHF) have shown that KL-regularization plays a pivotal role in improving the efficiency of RL fine-tuning for large language models (LLMs). Despite its empirical advantage, the theoretical difference between KL-regularized RL and standard RL remains largely under-explored. While there is a recent line of work on the theoretical analys… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07460v2-abstract-full').style.display = 'inline'; document.getElementById('2502.07460v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07460v2-abstract-full" style="display: none;"> Recent advances in Reinforcement Learning from Human Feedback (RLHF) have shown that KL-regularization plays a pivotal role in improving the efficiency of RL fine-tuning for large language models (LLMs). Despite its empirical advantage, the theoretical difference between KL-regularized RL and standard RL remains largely under-explored. While there is a recent line of work on the theoretical analysis of KL-regularized objective in decision making \citep{xiong2024iterative, xie2024exploratory,zhao2024sharp}, these analyses either reduce to the traditional RL setting or rely on strong coverage assumptions. In this paper, we propose an optimism-based KL-regularized online contextual bandit algorithm, and provide a novel analysis of its regret. 
By carefully leveraging the benign optimization landscape induced by the KL-regularization and the optimistic reward estimation, our algorithm achieves an $\mathcal{O}\big(畏\log (N_{\mathcal R} T)\cdot d_{\mathcal R}\big)$ logarithmic regret bound, where $畏, N_{\mathcal R},T,d_{\mathcal R}$ denote the KL-regularization parameter, the cardinality of the reward function class, number of rounds, and the complexity of the reward function class. Furthermore, we extend our algorithm and analysis to reinforcement learning by developing a novel decomposition over transition steps and also obtain a similar logarithmic regret bound. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07460v2-abstract-full').style.display = 'none'; document.getElementById('2502.07460v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03699">arXiv:2502.03699</a> <span> [<a href="https://arxiv.org/pdf/2502.03699">pdf</a>, <a href="https://arxiv.org/format/2502.03699">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> LLM Alignment as Retriever Optimization: An Information Retrieval Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bowen Jin</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+J">Jinsung Yoon</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhen Qin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqi Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+Y">Yu Meng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiawei Han</a>, <a href="/search/cs?searchtype=author&query=Arik%2C+S+O">Sercan O. Arik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03699v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have revolutionized artificial intelligence with capabilities in reasoning, coding, and communication, driving innovation across industries. Their true potential depends on effective alignment to ensure correct, trustworthy and ethical behavior, addressing challenges like misinformation, hallucinations, bias and misuse. 
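For context, the objective class being analyzed is the standard KL-regularized bandit/RLHF objective. A sketch in the abstract's notation follows; the precise formulation, including whether $\eta$ or $1/\eta$ scales the KL term, is as defined in the paper.

```latex
% KL-regularized contextual bandit objective (sketch; notation from the abstract).
% \pi_0 is the reference policy, \eta the KL-regularization parameter.
\max_{\pi}\; \mathbb{E}_{x,\; a \sim \pi(\cdot \mid x)}\big[ r(x, a) \big]
\;-\; \frac{1}{\eta}\, \mathbb{E}_{x}\Big[ \mathrm{KL}\big( \pi(\cdot \mid x) \,\big\|\, \pi_0(\cdot \mid x) \big) \Big],
\qquad
\mathrm{Regret}(T) \;=\; \mathcal{O}\big( \eta \log (N_{\mathcal{R}} T) \cdot d_{\mathcal{R}} \big).
```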
5. arXiv:2502.03699 [cs.CL, cs.AI, cs.IR]
LLM Alignment as Retriever Optimization: An Information Retrieval Perspective
Authors: Bowen Jin, Jinsung Yoon, Zhen Qin, Ziqi Wang, Wei Xiong, Yu Meng, Jiawei Han, Sercan O. Arik
Abstract: Large Language Models (LLMs) have revolutionized artificial intelligence with capabilities in reasoning, coding, and communication, driving innovation across industries. Their true potential depends on effective alignment to ensure correct, trustworthy, and ethical behavior, addressing challenges like misinformation, hallucinations, bias, and misuse. While existing Reinforcement Learning (RL)-based alignment methods are notoriously complex, direct optimization approaches offer a simpler alternative. In this work, we introduce a novel direct optimization approach for LLM alignment by drawing on established Information Retrieval (IR) principles. We present a systematic framework that bridges LLM alignment and IR methodologies, mapping LLM generation and reward models to IR's retriever-reranker paradigm. Building on this foundation, we propose LLM Alignment as Retriever Preference Optimization (LarPO), a new alignment method that enhances overall alignment quality. Extensive experiments validate LarPO's effectiveness, with 38.9% and 13.7% averaged improvements on AlpacaEval2 and MixEval-Hard, respectively. Our work opens new avenues for advancing LLM alignment by integrating IR foundations, offering a promising direction for future research.
Submitted 5 February, 2025; originally announced February 2025.
Comments: 26 pages
6. arXiv:2501.12599 [cs.AI, cs.LG]
Kimi k1.5: Scaling Reinforcement Learning with LLMs
Authors: Kimi Team, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, Chuning Tang, Congcong Wang, Dehao Zhang, Enming Yuan, Enzhe Lu, Fengxiang Tang, Flood Sung, Guangda Wei, Guokun Lai, Haiqing Guo, Han Zhu, Hao Ding, Hao Hu, Hao Yang, Hao Zhang, et al. (69 additional authors not shown)
Abstract: Language model pretraining with next-token prediction has proved effective for scaling compute but is limited by the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of artificial intelligence, with the promise that large language models (LLMs) can scale their training data by learning to explore with rewards. However, prior published work has not produced competitive results. In light of this, we report on the training practice of Kimi k1.5, our latest multi-modal LLM trained with RL, including its RL training techniques, multi-modal data recipes, and infrastructure optimization. Long-context scaling and improved policy optimization methods are key ingredients of our approach, which establishes a simple, effective RL framework without relying on more complex techniques such as Monte Carlo tree search, value functions, and process reward models. Notably, our system achieves state-of-the-art reasoning performance across multiple benchmarks and modalities -- e.g., 77.5 on AIME, 96.2 on MATH500, 94th percentile on Codeforces, 74.9 on MathVista -- matching OpenAI's o1. Moreover, we present effective long2short methods that use long-CoT techniques to improve short-CoT models, yielding state-of-the-art short-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on LiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and Claude Sonnet 3.5 by a large margin (up to +550%).
Submitted 21 January, 2025; originally announced January 2025.
Comments: 25 pages

7. arXiv:2501.03492 [cs.LG]
Multi-Source Urban Traffic Flow Forecasting with Drone and Loop Detector Data
Authors: Weijiang Xiong, Robert Fonod, Alexandre Alahi, Nikolas Geroliminis
Abstract: Traffic forecasting is a fundamental task in transportation research; however, current research has mainly focused on the single data modality of loop detectors. Recently, advances in Artificial Intelligence and drone technologies have made possible novel solutions for efficient, accurate, and flexible aerial observation of urban traffic. As a promising traffic monitoring approach, drone-captured data can create an accurate multi-sensor mobility observatory for large-scale urban networks when combined with existing infrastructure. Therefore, this paper investigates the problem of multi-source traffic speed prediction, simultaneously using drone and loop detector data. A simple yet effective graph-based model, HiMSNet, is proposed to integrate multiple data modalities and learn spatio-temporal correlations. Detailed analysis shows that predicting accurate segment-level speed is more challenging than regional speed, especially under high-demand scenarios with heavier congestion and varying traffic dynamics. Utilizing both drone and loop detector data, prediction accuracy can be improved over single-modality cases when the sensors have lower coverage and are subject to noise. Our simulation study, based on vehicle trajectories in a real urban road network, highlights the added value of integrating drones into traffic forecasting and monitoring.
Submitted 6 January, 2025; originally announced January 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03492v1-abstract-full').style.display = 'none'; document.getElementById('2501.03492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05053">arXiv:2412.05053</a> <span> [<a href="https://arxiv.org/pdf/2412.05053">pdf</a>, <a href="https://arxiv.org/format/2412.05053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EvTTC: An Event Camera Dataset for Time-to-Collision Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+K">Kaizhen Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinghang Li</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+K">Kuan Dai</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+B">Bangyan Liao</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05053v1-abstract-short" style="display: inline;"> Time-to-Collision (TTC) estimation lies in the core of the forward collision warning (FCW) functionality, which is key to all Automatic Emergency Braking (AEB) systems. Although the success of solutions using frame-based cameras (e.g., Mobileye's solutions) has been witnessed in normal situations, some extreme cases, such as the sudden variation in the relative speed of leading vehicles and the su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05053v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05053v1-abstract-full" style="display: none;"> Time-to-Collision (TTC) estimation lies in the core of the forward collision warning (FCW) functionality, which is key to all Automatic Emergency Braking (AEB) systems. Although the success of solutions using frame-based cameras (e.g., Mobileye's solutions) has been witnessed in normal situations, some extreme cases, such as the sudden variation in the relative speed of leading vehicles and the sudden appearance of pedestrians, still pose significant risks that cannot be handled. This is due to the inherent imaging principles of frame-based cameras, where the time interval between adjacent exposures introduces considerable system latency to AEB. Event cameras, as a novel bio-inspired sensor, offer ultra-high temporal resolution and can asynchronously report brightness changes at the microsecond level. 
To explore the potential of event cameras in the above-mentioned challenging cases, we propose EvTTC, which is, to the best of our knowledge, the first multi-sensor dataset focusing on TTC tasks under high-relative-speed scenarios. EvTTC consists of data collected using standard cameras and event cameras, covering various potential collision scenarios in daily driving and involving multiple collision objects. Additionally, LiDAR and GNSS/INS measurements are provided for the calculation of ground-truth TTC. Considering the high cost of testing TTC algorithms on full-scale mobile platforms, we also provide a small-scale TTC testbed for experimental validation and data augmentation. All the data and the design of the testbed are open sourced, and they can serve as a benchmark that will facilitate the development of vision-based TTC techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05053v1-abstract-full').style.display = 'none'; document.getElementById('2412.05053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03347">arXiv:2412.03347</a> <span> [<a href="https://arxiv.org/pdf/2412.03347">pdf</a>, <a href="https://arxiv.org/format/2412.03347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DIVE: Taming DINO for Subject-Driven Video Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yi Huang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chaoqi Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianzhuang Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+M">Mingfu Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03347v1-abstract-short" style="display: inline;"> Building on the success of diffusion models in image generation and editing, video editing has recently gained substantial attention. However, maintaining temporal consistency and motion alignment still remains challenging. 
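As background for the dataset's task: to first order, time-to-collision is range divided by closing speed, which is why LiDAR and GNSS/INS range measurements suffice for ground truth, as the abstract notes. A minimal sketch (illustrative; not the EvTTC toolkit):

```python
def time_to_collision(range_m: float, closing_speed_mps: float) -> float:
    """First-order TTC: distance to the lead object divided by closing speed.
    Returns +inf when the gap is not shrinking. Illustrative; not EvTTC code."""
    if closing_speed_mps <= 0.0:
        return float("inf")
    return range_m / closing_speed_mps

# Example: a lead vehicle 30 m ahead, approached at 10 m/s -> 3 s TTC.
assert time_to_collision(30.0, 10.0) == 3.0
```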
9. arXiv:2412.03347 [cs.CV, cs.AI]
DIVE: Taming DINO for Subject-Driven Video Editing
Authors: Yi Huang, Wei Xiong, He Zhang, Chaoqi Chen, Jianzhuang Liu, Mingfu Yan, Shifeng Chen
Abstract: Building on the success of diffusion models in image generation and editing, video editing has recently gained substantial attention. However, maintaining temporal consistency and motion alignment still remains challenging. To address these issues, this paper proposes DINO-guided Video Editing (DIVE), a framework designed to facilitate subject-driven editing in source videos conditioned on either target text prompts or reference images with specific identities. The core of DIVE lies in leveraging the powerful semantic features extracted from a pretrained DINOv2 model as implicit correspondences to guide the editing process. Specifically, to ensure temporal motion consistency, DIVE employs DINO features to align with the motion trajectory of the source video. For precise subject editing, DIVE incorporates the DINO features of reference images into a pretrained text-to-image model to learn Low-Rank Adaptations (LoRAs), effectively registering the target subject's identity. Extensive experiments on diverse real-world videos demonstrate that our framework achieves high-quality editing results with robust motion consistency, highlighting the potential of DINO to contribute to video editing. Project page: https://dino-video-editing.github.io
Submitted 4 December, 2024; originally announced December 2024.
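To make "DINO features as implicit correspondences" concrete, here is a hedged sketch that matches patch features between two frames with a pretrained DINOv2 backbone. The torch.hub entry point is the public DINOv2 release; the cosine-similarity nearest-neighbor matching is a generic technique, not DIVE's actual alignment procedure.

```python
import torch
import torch.nn.functional as F

# Public DINOv2 backbone from torch.hub (facebookresearch/dinov2).
model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
model.eval()

@torch.no_grad()
def patch_features(frame: torch.Tensor) -> torch.Tensor:
    """frame: (1, 3, H, W), ImageNet-normalized, H and W divisible by 14.
    Returns L2-normalized patch tokens of shape (N, C)."""
    feats = model.forward_features(frame)["x_norm_patchtokens"]  # (1, N, C)
    return F.normalize(feats[0], dim=-1)

@torch.no_grad()
def correspondences(src: torch.Tensor, tgt: torch.Tensor) -> torch.Tensor:
    """For each source patch, the index of the most similar target patch."""
    sim = patch_features(src) @ patch_features(tgt).T  # (N_src, N_tgt) cosine sims
    return sim.argmax(dim=1)
```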
10. arXiv:2412.02635 [cs.CV]
MetaShadow: Object-Centered Shadow Detection, Removal, and Synthesis
Authors: Tianyu Wang, Jianming Zhang, Haitian Zheng, Zhihong Ding, Scott Cohen, Zhe Lin, Wei Xiong, Chi-Wing Fu, Luis Figueroa, Soo Ye Kim
Abstract: Shadows are often under-considered or even ignored in image editing applications, limiting the realism of the edited results. In this paper, we introduce MetaShadow, a three-in-one versatile framework that enables detection, removal, and controllable synthesis of shadows in natural images in an object-centered fashion. MetaShadow combines the strengths of two cooperative components: Shadow Analyzer, for object-centered shadow detection and removal, and Shadow Synthesizer, for reference-based controllable shadow synthesis. Notably, we optimize the learning of the intermediate features from Shadow Analyzer to guide Shadow Synthesizer to generate more realistic shadows that blend seamlessly with the scene. Extensive evaluations on multiple shadow benchmark datasets show significant improvements of MetaShadow over the existing state-of-the-art methods on object-centered shadow detection, removal, and synthesis. MetaShadow excels in image-editing tasks such as object removal, relocation, and insertion, pushing the boundaries of object-centered image editing.
Submitted 3 December, 2024; originally announced December 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02635v1-abstract-full').style.display = 'none'; document.getElementById('2412.02635v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00306">arXiv:2412.00306</a> <span> [<a href="https://arxiv.org/pdf/2412.00306">pdf</a>, <a href="https://arxiv.org/format/2412.00306">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Refine-by-Align: Reference-Guided Artifacts Refinement through Semantic Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yizhi Song</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Liu He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Price%2C+B">Brian Price</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+S">Scott Cohen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Aliaga%2C+D">Daniel Aliaga</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00306v1-abstract-short" style="display: inline;"> Personalized image generation has emerged from the recent advancements in generative models. However, these generated personalized images often suffer from localized artifacts such as incorrect logos, reducing fidelity and fine-grained identity details of the generated results. Furthermore, there is little prior work tackling this problem. To help improve these identity details in the personalized… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00306v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00306v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00306v1-abstract-full" style="display: none;"> Personalized image generation has emerged from the recent advancements in generative models. However, these generated personalized images often suffer from localized artifacts such as incorrect logos, reducing fidelity and fine-grained identity details of the generated results. Furthermore, there is little prior work tackling this problem. To help improve these identity details in the personalized image generation, we introduce a new task: reference-guided artifacts refinement. We present Refine-by-Align, a first-of-its-kind model that employs a diffusion-based framework to address this challenge. 
Our model consists of two stages: Alignment Stage and Refinement Stage, which share weights of a unified neural network model. Given a generated image, a masked artifact region, and a reference image, the alignment stage identifies and extracts the corresponding regional features in the reference, which are then used by the refinement stage to fix the artifacts. Our model-agnostic pipeline requires no test-time tuning or optimization. It automatically enhances image fidelity and reference identity in the generated image, generalizing well to existing models on various tasks including but not limited to customization, generative compositing, view synthesis, and virtual try-on. Extensive experiments and comparisons demonstrate that our pipeline greatly pushes the boundary of fine details in the image synthesis models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00306v1-abstract-full').style.display = 'none'; document.getElementById('2412.00306v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19258">arXiv:2410.19258</a> <span> [<a href="https://arxiv.org/pdf/2410.19258">pdf</a>, <a href="https://arxiv.org/format/2410.19258">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Not All Heads Matter: A Head-Level KV Cache Compression Method with Integrated Retrieval and Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yu Fu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zefan Cai</a>, <a href="/search/cs?searchtype=author&query=Asi%2C+A">Abedelkadir Asi</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wayne Xiong</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yue Dong</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wen Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19258v3-abstract-short" style="display: inline;"> Key-Value (KV) caching is a common technique to enhance the computational efficiency of Large Language Models (LLMs), but its memory overhead grows rapidly with input length. Prior work has shown that not all tokens are equally important for text generation, proposing layer-level KV cache compression to selectively retain key information. 
arXiv:2410.19258 (https://arxiv.org/abs/2410.19258) [pdf, other]
Tags: cs.CL (Computation and Language), cs.AI (Artificial Intelligence)
Title: Not All Heads Matter: A Head-Level KV Cache Compression Method with Integrated Retrieval and Reasoning
Authors: Yu Fu, Zefan Cai, Abedelkadir Asi, Wayne Xiong, Yue Dong, Wen Xiao
Abstract: Key-Value (KV) caching is a common technique to enhance the computational efficiency of Large Language Models (LLMs), but its memory overhead grows rapidly with input length. Prior work has shown that not all tokens are equally important for text generation, proposing layer-level KV cache compression to selectively retain key information. Recognizing the distinct roles of attention heads in generation, we propose HeadKV, a head-level KV cache compression method, and HeadKV-R2, which leverages a novel contextual reasoning ability estimation for compression. Our approach operates at the level of individual heads, estimating their importance for contextual QA tasks that require both retrieval and reasoning capabilities. Extensive experiments across diverse benchmarks (LongBench, LooGLE), model architectures (e.g., Llama-3-8B-Instruct, Mistral-7B-Instruct), and long-context ability tests demonstrate that our head-level KV cache compression significantly outperforms strong baselines, particularly in low-resource settings (KV size = 64 & 128). Notably, our method retains just 1.5% of the KV cache while achieving 97% of the performance of the full KV cache on the contextual question-answering benchmark. Code is available at https://github.com/FYYFU/HeadKV
Submitted: 13 November, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: 18 pages
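To make the head-level idea concrete, the sketch below scores attention heads and splits a global KV budget proportionally, then truncates one head's cache to its share. The importance estimator and allocation rule here are illustrative stand-ins, not HeadKV's actual procedure:

```python
import numpy as np

def allocate_head_budgets(head_scores, total_budget, floor=4):
    """Split a global KV-cache token budget across heads in proportion to
    their estimated retrieval/reasoning importance (illustrative rule)."""
    w = np.asarray(head_scores, dtype=float)
    w = w / w.sum()
    return np.maximum(floor, np.floor(w * total_budget)).astype(int)

def compress_head_cache(keys, values, attn_mass, budget):
    """Keep only the `budget` positions this head attends to most.
    keys/values: (T, d); attn_mass: (T,) accumulated attention per position."""
    keep = np.sort(np.argsort(attn_mass)[-budget:])  # preserve token order
    return keys[keep], values[keep]

# Toy run: 8 heads sharing a 128-token budget over a 512-token context.
rng = np.random.default_rng(0)
budgets = allocate_head_budgets(rng.random(8), 128)
k, v = rng.normal(size=(512, 64)), rng.normal(size=(512, 64))
k_small, v_small = compress_head_cache(k, v, rng.random(512), budgets[0])
print(budgets, k_small.shape)
```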
arXiv:2410.07706 (https://arxiv.org/abs/2410.07706) [pdf, other]
Tags: cs.CL (Computation and Language), cs.AI (Artificial Intelligence)
Title: AgentBank: Towards Generalized LLM Agents via Fine-Tuning on 50000+ Interaction Trajectories
Authors: Yifan Song, Weimin Xiong, Xiutian Zhao, Dawei Zhu, Wenhao Wu, Ke Wang, Cheng Li, Wei Peng, Sujian Li
Abstract: Fine-tuning on agent-environment interaction trajectory data holds significant promise for surfacing generalized agent capabilities in open-source large language models (LLMs). In this work, we introduce AgentBank, by far the largest trajectory-tuning data collection, featuring more than 50k diverse, high-quality interaction trajectories that span 16 tasks covering five distinct agent skill dimensions. Leveraging a novel annotation pipeline, we are able to scale the annotated trajectories and generate a trajectory dataset with minimized difficulty bias. Furthermore, we fine-tune LLMs on AgentBank to obtain Samoyed, a series of agent models. Our comparative experiments demonstrate the effectiveness of scaling the interaction trajectory data for acquiring generalized agent capabilities. Additional studies also reveal some key observations regarding trajectory tuning and agent skill generalization.
Submitted: 10 October, 2024; originally announced October 2024.
Comments: Findings of EMNLP 2024

arXiv:2410.01556 (https://arxiv.org/abs/2410.01556) [pdf, other]
Tags: cs.CL (Computation and Language), cs.AI (Artificial Intelligence), cs.LG (Machine Learning)
Title: Integrative Decoding: Improve Factuality via Implicit Self-consistency
Authors: Yi Cheng, Xiao Liang, Yeyun Gong, Wen Xiao, Song Wang, Yuji Zhang, Wenjun Hou, Kaishuai Xu, Wenge Liu, Wenjie Li, Jian Jiao, Qi Chen, Peng Cheng, Wayne Xiong
Abstract: Self-consistency-based approaches, which involve repeatedly sampling multiple outputs and selecting the most consistent one as the final response, prove to be remarkably effective in improving the factual accuracy of large language models. Nonetheless, existing methods usually have strict constraints on the task format, largely limiting their applicability.
In this paper, we present Integrative Decoding (ID) to unlock the potential of self-consistency in open-ended generation tasks. ID operates by constructing a set of inputs, each prepended with a previously sampled response, and then processing them concurrently; at each decoding step, the next token is selected by aggregating all of their corresponding predictions. In essence, this simple approach implicitly incorporates self-consistency into the decoding objective. Extensive evaluation shows that ID consistently enhances factuality over a wide range of language models, with substantial improvements on the TruthfulQA (+11.2%), Biographies (+15.4%) and LongFact (+8.5%) benchmarks. The performance gains amplify progressively as the number of sampled responses increases, indicating the potential of ID to scale up with repeated sampling.
Submitted: 23 January, 2025; v1 submitted 2 October, 2024; originally announced October 2024.
Comments: Accepted by ICLR 2025
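The decoding rule lends itself to a compact sketch: run the k draft-prepended inputs in parallel and pick the token whose prediction is best on average. Below, aggregation is a mean of per-input log-probabilities; the paper's exact aggregation may differ, so treat this as one plausible reading of the abstract:

```python
import numpy as np

def log_softmax(x):
    x = x - x.max(axis=-1, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=-1, keepdims=True))

def integrative_step(logits_per_input):
    """logits_per_input: (k, vocab) next-token logits, one row per input
    that was prepended with a different previously sampled response.
    Returns the token id that maximizes the averaged log-probability."""
    avg_logprob = log_softmax(np.asarray(logits_per_input, float)).mean(axis=0)
    return int(avg_logprob.argmax())

# Toy example: 3 drafts voting over a 5-token vocabulary.
print(integrative_step(np.array([[2.0, 1.0, 0.1, 0.0, 0.0],
                                 [1.8, 1.2, 0.0, 0.0, 0.1],
                                 [0.5, 2.2, 0.0, 0.3, 0.0]])))
```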
arXiv:2409.19951 (https://arxiv.org/abs/2409.19951) [pdf, other]
Tags: cs.AI (Artificial Intelligence), cs.CL (Computation and Language), cs.CV (Computer Vision and Pattern Recognition)
Title: Law of the Weakest Link: Cross Capabilities of Large Language Models
Authors: Ming Zhong, Aston Zhang, Xuewei Wang, Rui Hou, Wenhan Xiong, Chenguang Zhu, Zhengxing Chen, Liang Tan, Chloe Bi, Mike Lewis, Sravya Popuri, Sharan Narang, Melanie Kambadur, Dhruv Mahajan, Sergey Edunov, Jiawei Han, Laurens van der Maaten
Abstract: The development and evaluation of Large Language Models (LLMs) have largely focused on individual capabilities.
However, this overlooks the intersection of multiple abilities across different types of expertise that real-world tasks often require, which we term cross capabilities. To systematically explore this concept, we first define seven core individual capabilities and then pair them to form seven common cross capabilities, each supported by a manually constructed taxonomy. Building on these definitions, we introduce CrossEval, a benchmark comprising 1,400 human-annotated prompts, with 100 prompts for each individual and cross capability. To ensure reliable evaluation, we involve expert annotators to assess 4,200 model responses, gathering 8,400 human ratings with detailed explanations to serve as reference examples. Our findings reveal that, in both static evaluations and attempts to enhance specific abilities, current LLMs consistently exhibit the "Law of the Weakest Link": cross-capability performance is significantly constrained by the weakest component. Specifically, across 58 cross-capability scores from 17 models, 38 scores are lower than both individual capabilities, while 20 fall between the strong and the weak one, closer to the weaker ability. These results highlight the under-performance of LLMs on cross-capability tasks, making the identification and improvement of the weakest capabilities a critical priority for future research to optimize performance in complex, multi-dimensional scenarios.
Submitted: 2 October, 2024; v1 submitted 30 September, 2024; originally announced September 2024.
Comments: Data, Code, & Benchmark: www.llm-cross-capabilities.org
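The 38/20 breakdown above is a bucketing of each cross-capability score against its two constituent scores. A small helper reproducing that bucketing (function and bucket names are ours, not the paper's):

```python
def weakest_link_bucket(cross_score, score_a, score_b):
    """Place a cross-capability score relative to its two individual
    capability scores, mirroring the paper's analysis of 58 scores."""
    weak, strong = min(score_a, score_b), max(score_a, score_b)
    if cross_score < weak:
        return "below both individual capabilities"
    if cross_score > strong:
        return "above both individual capabilities"
    near = "weaker" if cross_score - weak <= strong - cross_score else "stronger"
    return f"between the two, closer to the {near} capability"

print(weakest_link_bucket(58.0, 62.5, 74.0))  # below both individual capabilities
```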
arXiv:2409.13156 (https://arxiv.org/abs/2409.13156) [pdf, other]
Tags: cs.CL (Computation and Language)
Title: RRM: Robust Reward Model Training Mitigates Reward Hacking
Authors: Tianqi Liu, Wei Xiong, Jie Ren, Lichang Chen, Junru Wu, Rishabh Joshi, Yang Gao, Jiaming Shen, Zhen Qin, Tianhe Yu, Daniel Sohn, Anastasiia Makarova, Jeremiah Liu, Yuan Liu, Bilal Piot, Abe Ittycheriah, Aviral Kumar, Mohammad Saleh
Abstract: Reward models (RMs) play a pivotal role in aligning large language models (LLMs) with human preferences. However, traditional RM training, which relies on response pairs tied to specific prompts, struggles to disentangle prompt-driven preferences from prompt-independent artifacts, such as response length and format.
In this work, we expose a fundamental limitation of current RM training methods, where RMs fail to effectively distinguish between contextual signals and irrelevant artifacts when determining preferences. To address this, we introduce a causal framework that learns preferences independent of these artifacts, and we propose a novel data augmentation technique designed to eliminate them. Extensive experiments show that our approach successfully filters out undesirable artifacts, yielding a more robust reward model (RRM). Our RRM improves the performance of a pairwise reward model trained on Gemma-2-9b-it on RewardBench, increasing accuracy from 80.61% to 84.15%. Additionally, we train two DPO policies using both the RM and the RRM, demonstrating that the RRM significantly enhances DPO-aligned policies, improving MT-Bench scores from 7.27 to 8.31 and length-controlled win rates in AlpacaEval-2 from 33.46% to 52.49%.
Submitted: 19 September, 2024; originally announced September 2024.
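One way to picture an artifact-removing augmentation is to add comparisons whose winner can only be explained by prompt relevance, not by format: for example, prefer the on-topic chosen response over a well-formatted response written for an unrelated prompt. This is a loose sketch of the idea under that assumption, not the paper's exact recipe:

```python
import random

def augment_with_off_prompt_pairs(triples, seed=0):
    """triples: list of (prompt, chosen, rejected).
    For each prompt, add a pair whose 'rejected' side is a response sampled
    for a *different* prompt, so artifacts such as length or lists stop
    predicting the preference label (illustrative only)."""
    rng = random.Random(seed)
    augmented = list(triples)
    for prompt, chosen, _ in triples:
        other_prompt, other_chosen, _ = rng.choice(triples)
        if other_prompt != prompt:
            augmented.append((prompt, chosen, other_chosen))
    return augmented
```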
arXiv:2409.12979 (https://arxiv.org/abs/2409.12979) [pdf, other]
Tags: cs.HC (Human-Computer Interaction), cs.AI (Artificial Intelligence)
Title: Can we only use guideline instead of shot in prompt?
Authors: Jiaxiang Chen, Song Wang, Zhucong Li, Wayne Xiong, Lizhen Qu, Zenglin Xu, Yuan Qi
Abstract: Currently, prompting techniques can be divided into two main categories: 1) the shot method implicitly inspires the model to answer the question by mimicking the steps in a given example, e.g., few-shot CoT; 2) the guideline method explicitly instructs the model to reason by following guidelines, which contain succinct, task-specific knowledge. The shot method is prone to difficulties in selecting the type of shots, the number of shots, and the design of the reasoning steps, so a question arises: can we use only guidelines instead of shots in the prompt? To this end, we propose the FGT framework, consisting of Feedback, Guideline, and Tree-gather agents, to automatically learn task-specific guidelines from a dataset. First, the feedback agent is designed to evaluate the outcomes, both right and wrong, of each Q&A in order to gather insights that guide more effective optimization strategies. Next, the guideline agent is tasked with deriving guidelines from each piece of feedback and storing them in local memory. Lastly, the tree-gather agent aggregates all guidelines hierarchically through a tree structure, ultimately obtaining all unduplicated guidelines from a global perspective. In addition, we induce the model to generate intermediate reasoning to ensure consistency with the guidelines. Experimental results demonstrate that our approach achieves superior performance across multiple tasks, highlighting the effectiveness of using guidelines in the prompt.
Submitted: 3 September, 2024; originally announced September 2024.
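The three-agent loop can be sketched around any LLM-call function. The prompts below are illustrative, and the hierarchical tree-gather is flattened into a single merge step for brevity:

```python
def learn_guidelines(qa_results, llm):
    """qa_results: list of (question, answer, is_correct); llm: str -> str.
    A feedback agent critiques each outcome, a guideline agent distills a
    reusable rule, and a final merge deduplicates (a flat stand-in for the
    paper's tree-gather agent)."""
    drafts = []
    for question, answer, is_correct in qa_results:
        verdict = "correct" if is_correct else "wrong"
        feedback = llm(f"The answer '{answer}' to '{question}' was {verdict}. "
                       "Explain briefly why.")
        drafts.append(llm(f"Distill one reusable task guideline from: {feedback}"))
    return llm("Merge and deduplicate these guidelines:\n" + "\n".join(drafts))
```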
arXiv:2409.11704 (https://arxiv.org/abs/2409.11704) [pdf, other]
Tags: cs.CL (Computation and Language), cs.LG (Machine Learning)
Title: From Lists to Emojis: How Format Bias Affects Model Alignment
Authors: Xuanchang Zhang, Wei Xiong, Lichang Chen, Tianyi Zhou, Heng Huang, Tong Zhang
Abstract: In this paper, we study format biases in reinforcement learning from human feedback (RLHF). We observe that many widely used preference models, including human evaluators, GPT-4, and top-ranking models on the RewardBench benchmark, exhibit strong biases towards specific format patterns, such as lists, links, bold text, and emojis. Furthermore, large language models (LLMs) can exploit these biases to achieve higher rankings on popular benchmarks like AlpacaEval and LMSYS Chatbot Arena. One notable example is verbosity bias, where current preference models favor longer responses that appear more comprehensive, even when their quality is equal to or lower than that of shorter, competing responses. However, format biases beyond verbosity remain largely underexplored in the literature. In this work, we extend the study of biases in preference learning beyond the commonly recognized length bias, offering a comprehensive analysis of a wider range of format biases. Additionally, we show that with a small amount of biased data (less than 1%), we can inject significant bias into the reward model. Moreover, these format biases can also be easily exploited by downstream alignment algorithms, such as best-of-n sampling and online iterative DPO, as it is usually easier to manipulate the format than to improve the quality of responses.
Our findings emphasize the need to disentangle format from content, both for designing alignment algorithms and for evaluating models.
Submitted: 18 September, 2024; originally announced September 2024.
Comments: Work in progress
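A crude probe for such biases is to detect format patterns with regular expressions and measure how often a preference model's winner carries a pattern its loser lacks. The pattern set follows the abstract (lists, links, bold text, emojis), while the detectors themselves are our simplifications:

```python
import re

PATTERNS = {
    "list": re.compile(r"^\s*(?:[-*]|\d+\.)\s", re.M),
    "link": re.compile(r"https?://\S+"),
    "bold": re.compile(r"\*\*[^*]+\*\*"),
    "emoji": re.compile("[\U0001F300-\U0001FAFF]"),
}

def pattern_win_rate(judged_pairs, pattern):
    """judged_pairs: list of (winner_text, loser_text) from a preference
    model. Returns the fraction of pairs where only the winner shows the
    pattern, a rough indicator of format bias."""
    p = PATTERNS[pattern]
    hits = sum(1 for win, lose in judged_pairs
               if p.search(win) and not p.search(lose))
    return hits / max(1, len(judged_pairs))

pairs = [("**Sure!** Here you go", "Sure, here you go"),
         ("- a\n- b", "a and b")]
print(pattern_win_rate(pairs, "bold"), pattern_win_rate(pairs, "list"))
```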
arXiv:2409.10760 (https://arxiv.org/abs/2409.10760) [pdf, other]
Tags: cs.CL (Computation and Language), cs.SI (Social and Information Networks)
Title: Semantics Preserving Emoji Recommendation with Large Language Models
Authors: Zhongyi Qiu, Kangyi Qiu, Hanjia Lyu, Wei Xiong, Jiebo Luo
Abstract: Emojis have become an integral part of digital communication, enriching text by conveying emotions, tone, and intent. Existing emoji recommendation methods are primarily evaluated on their ability to match the exact emoji a user chose in the original text. However, this ignores the essence of users' behavior on social media: each text can correspond to multiple reasonable emojis. To better assess a model's ability to align with such real-world emoji usage, we propose a new semantics-preserving evaluation framework for emoji recommendation, which measures a model's ability to recommend emojis that maintain semantic consistency with the user's text. To evaluate how well a model preserves semantics, we assess whether the predicted affective state, demographic profile, and attitudinal stance of the user remain unchanged. If these attributes are preserved, we consider the recommended emojis to have maintained the original semantics. The advanced abilities of Large Language Models (LLMs) in understanding and generating nuanced, contextually relevant output make them well suited to the complexities of semantics-preserving emoji recommendation. To this end, we construct a comprehensive benchmark to systematically assess the performance of six proprietary and open-source LLMs using different prompting techniques on our task. Our experiments demonstrate that GPT-4o outperforms the other LLMs, achieving a semantics preservation score of 79.23%. Additionally, we conduct case studies to analyze model biases in downstream classification tasks and evaluate the diversity of the recommended emojis.
Submitted: 16 September, 2024; originally announced September 2024.
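The evaluation protocol is easy to express as code: predict the user attributes from the bare text and from the text with the recommended emoji appended, and count the recommendation as semantics-preserving only if the attributes match. The attribute classifier is passed in as a callable, since the probes (affective state, demographic profile, attitudinal stance) are models of their own; this is a sketch of the protocol, not the paper's code:

```python
def semantics_preserved(text, emoji, predict_attributes):
    """predict_attributes: callable mapping a text to a dict such as
    {'affect': ..., 'demographic': ..., 'stance': ...}. The emoji counts
    as semantics-preserving if appending it leaves all attributes unchanged."""
    return predict_attributes(text) == predict_attributes(f"{text} {emoji}")

def preservation_score(samples, predict_attributes):
    """samples: list of (text, recommended_emoji) pairs."""
    kept = sum(semantics_preserved(t, e, predict_attributes) for t, e in samples)
    return kept / len(samples)
```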
arXiv:2409.08520 (https://arxiv.org/abs/2409.08520) [pdf, other]
Tags: cs.CV (Computer Vision and Pattern Recognition)
Title: GroundingBooth: Grounding Text-to-Image Customization
Authors: Zhexiao Xiong, Wei Xiong, Jing Shi, He Zhang, Yizhi Song, Nathan Jacobs
Abstract: Recent studies in text-to-image customization show great success in generating personalized object variants given several images of a subject. While existing methods focus more on preserving the identity of the subject, they often fall short of controlling the spatial relationship between objects. In this work, we introduce GroundingBooth, a framework that achieves zero-shot, instance-level spatial grounding of both foreground subjects and background objects in the text-to-image customization task. Our proposed text-image grounding module and masked cross-attention layer allow us to generate personalized images with both accurate layout alignment and identity preservation, while maintaining text-image coherence. With such layout control, our model inherently enables the customization of multiple subjects at once. Our model is evaluated on both layout-guided image synthesis and reference-based customization tasks, showing strong results compared to existing methods. Ours is the first work to achieve joint grounding of both subject-driven foreground generation and text-driven background generation.
Submitted: 3 October, 2024; v1 submitted 12 September, 2024; originally announced September 2024.
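The masked cross-attention layer mentioned above can be sketched generically: each image-token query may only attend to the condition tokens whose grounded box covers its spatial location. This is textbook layout-masked attention, not the paper's implementation:

```python
import numpy as np

def masked_cross_attention(q, k, v, layout_mask):
    """q: (Nq, d) image-token queries; k, v: (Nk, d) condition tokens;
    layout_mask: (Nq, Nk) bool, True where query position i lies inside the
    box grounded to token j (each row should keep at least one True)."""
    scores = q @ k.T / np.sqrt(q.shape[-1])
    scores = np.where(layout_mask, scores, -1e9)   # hide out-of-box tokens
    w = np.exp(scores - scores.max(axis=-1, keepdims=True))
    w = w / w.sum(axis=-1, keepdims=True)
    return w @ v
```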
arXiv:2409.02689 (https://arxiv.org/abs/2409.02689) [pdf]
Tags: physics.app-ph (Applied Physics), cs.ET (Emerging Technologies)
Title: Frequency-domain Parallel Computing Using Single On-Chip Nonlinear Acoustic-wave Device
Authors: Jun Ji, Zichen Xi, Bernadeta R. Srijanto, Ivan I. Kravchenko, Ming Jin, Wenjie Xiong, Linbo Shao
Abstract: Multiply-accumulation (MAC) is a crucial computing operation in signal processing, numerical simulations, and machine learning. This work presents a scalable, programmable, frequency-domain parallel computing scheme leveraging gigahertz (GHz)-frequency acoustic-wave nonlinearities. By encoding data in the frequency domain, a single nonlinear acoustic-wave device can perform a billion arithmetic operations simultaneously. A single device with a footprint of 0.03 mm$^2$ on lithium niobate (LN) achieves 0.0144 tera floating-point operations per second (TFLOPS), corresponding to a computing area density of 0.48 TFLOPS/mm$^2$ and a core power efficiency of 0.14 TFLOPS/Watt. As applications, we demonstrate multiplications of two 16-by-16 matrices and convolutional image processing of 128-by-128-pixel photos. Our technology could find versatile applications in near-sensor signal processing and edge computing.
Submitted: 4 September, 2024; originally announced September 2024.
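The arithmetic in the abstract checks out (0.0144 TFLOPS over 0.03 mm$^2$ is 0.48 TFLOPS/mm$^2$), and the mixing principle is easy to simulate: tones at frequencies f and g pushed through a quadratic nonlinearity produce a difference-frequency line at f - g whose amplitude is the product of the input amplitudes, so many products can be read off the spectrum at once. A toy numerical demo with made-up sub-MHz frequencies (the device itself operates at GHz):

```python
import numpy as np

fs, dur = 1_000_000, 0.01                    # sample rate (Hz), duration (s)
t = np.arange(int(fs * dur)) / fs
data = {100_000: 0.7, 117_000: 0.3}          # "data" tones a_i at f_i
weight_f, weight_a = 90_000, 0.5             # one "weight" tone b at g

x = sum(a * np.cos(2 * np.pi * f * t) for f, a in data.items())
x += weight_a * np.cos(2 * np.pi * weight_f * t)
y = x ** 2                                   # quadratic (nonlinear) mixing

spectrum = np.abs(np.fft.rfft(y)) * 2 / len(t)
freqs = np.fft.rfftfreq(len(t), 1 / fs)
for f, a in data.items():
    line = spectrum[np.argmin(np.abs(freqs - (f - weight_f)))]
    print(f"{f - weight_f} Hz line: {line:.3f} (expected a*b = {a * weight_a:.3f})")
```

Pairs of tones that share the same difference frequency add their products into one spectral bin, which is where the "accumulate" in multiply-accumulate comes from.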
arXiv:2409.02392 (https://arxiv.org/abs/2409.02392) [pdf, other]
Tags: cs.LG (Machine Learning), stat.ML (Machine Learning)
Title: Building Math Agents with Multi-Turn Iterative Preference Learning
Authors: Wei Xiong, Chengshuai Shi, Jiaming Shen, Aviv Rosenberg, Zhen Qin, Daniele Calandriello, Misha Khalman, Rishabh Joshi, Bilal Piot, Mohammad Saleh, Chi Jin, Tong Zhang, Tianqi Liu
Abstract: Recent studies have shown that large language models' (LLMs) mathematical problem-solving capabilities can be enhanced by integrating external tools, such as code interpreters, and employing multi-turn Chain-of-Thought (CoT) reasoning. While current methods focus on synthetic data generation and Supervised Fine-Tuning (SFT), this paper studies the complementary direct preference learning approach to further improve model performance. However, existing direct preference learning algorithms were originally designed for the single-turn chat task and do not fully address the complexities of the multi-turn reasoning and external tool integration required for tool-integrated mathematical reasoning.
To fill this gap, we introduce a multi-turn direct preference learning framework, tailored to this context, that leverages feedback from code interpreters and optimizes trajectory-level preferences. The framework includes multi-turn DPO and multi-turn KTO as specific implementations. We validate its effectiveness by training various language models on an augmented prompt set from the GSM8K and MATH datasets. Our results demonstrate substantial improvements: a supervised fine-tuned Gemma-1.1-it-7B model's performance increased from 77.5% to 83.9% on GSM8K and from 46.1% to 51.2% on MATH. Similarly, a Gemma-2-it-9B model improved from 84.1% to 86.3% on GSM8K and from 51.0% to 54.5% on MATH.
Submitted: 3 September, 2024; originally announced September 2024.
Comments: A multi-turn direct preference learning framework for tool-integrated reasoning tasks
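A sketch of the trajectory-level objective: sum the policy's log-probabilities over assistant tokens only (tool and interpreter outputs are observations, not actions), then apply the usual DPO comparison between a chosen and a rejected trajectory. The tensor layout and masking conventions are assumptions made for the sketch:

```python
import torch
import torch.nn.functional as F

def trajectory_logprob(model, ids, assistant_mask):
    """ids: (1, T) long tensor of the whole multi-turn trajectory;
    assistant_mask: (1, T) 1.0 on assistant-generated tokens, 0.0 on user
    text and code-interpreter output. model(ids) -> (1, T, vocab) logits."""
    logits = model(ids)[:, :-1]
    logp = F.log_softmax(logits, dim=-1)
    token_logp = logp.gather(-1, ids[:, 1:, None]).squeeze(-1)
    return (token_logp * assistant_mask[:, 1:]).sum()

def multi_turn_dpo_loss(policy, ref, chosen, rejected, beta=0.1):
    """chosen/rejected: dicts with 'ids' and 'mask'; `ref` should be frozen
    (wrap its calls in torch.no_grad() during training)."""
    def margin(traj):
        return (trajectory_logprob(policy, traj["ids"], traj["mask"])
                - trajectory_logprob(ref, traj["ids"], traj["mask"]))
    return -F.logsigmoid(beta * (margin(chosen) - margin(rejected)))
```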
arXiv:2408.14498 (https://arxiv.org/abs/2408.14498) [pdf, other]
Tags: stat.ML (Machine Learning), cs.LG (Machine Learning)
Title: Multi-Normal Prototypes Learning for Weakly Supervised Anomaly Detection
Authors: Zhijin Dong, Hongzhi Liu, Boyuan Ren, Weimin Xiong, Zhonghai Wu
Abstract: Anomaly detection is a crucial task in various domains. Most existing methods assume that normal samples cluster around a single central prototype, while real data may consist of multiple categories or subgroups. In addition, existing methods always assume that all unlabeled samples are normal, while some of them are inevitably anomalies. To address these issues, we propose a novel anomaly detection framework that works efficiently with limited labeled anomalies. Specifically, we assume that the normal samples may consist of multiple subgroups, and we propose to learn multi-normal prototypes to represent them via deep embedding clustering and contrastive learning. Additionally, we propose a method to estimate the likelihood of each unlabeled sample being normal during model training, which helps learn a more effective data encoder and normal prototypes for anomaly detection. Extensive experiments on various datasets demonstrate the superior performance of our method compared to state-of-the-art methods.
Submitted: 30 November, 2024; v1 submitted 23 August, 2024; originally announced August 2024.
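The multi-prototype idea suggests a simple scoring rule at inference time: embed the sample and measure its distance to the nearest learned normal prototype (here via a smooth minimum), reusing the same quantity as a soft weight for how likely an unlabeled sample is to be normal. A sketch under those assumptions, not the paper's exact estimator:

```python
import numpy as np

def anomaly_score(z, prototypes, tau=1.0):
    """z: (d,) embedding; prototypes: (K, d) learned normal prototypes.
    Smooth-min squared distance: large when z is far from every normal
    subgroup, approaching the hard minimum as tau -> 0."""
    d2 = ((prototypes - z) ** 2).sum(axis=1)
    return float(-tau * np.log(np.exp(-d2 / tau).sum()))

def normal_weight(z, prototypes, tau=1.0):
    """Soft likelihood-style weight for an unlabeled sample being normal,
    echoing the paper's idea of down-weighting suspected anomalies."""
    return float(np.exp(-anomaly_score(z, prototypes, tau)))

protos = np.array([[0.0, 0.0], [5.0, 5.0]])          # two normal subgroups
print(anomaly_score(np.array([0.1, 0.0]), protos))   # small: near a prototype
print(anomaly_score(np.array([2.5, 2.5]), protos))   # large: between subgroups
```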
arXiv:2408.00106 (https://arxiv.org/abs/2408.00106) [pdf, other]
Tags: cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence)
Title: WAS: Dataset and Methods for Artistic Text Segmentation
Authors: Xudong Xie, Yuzhe Li, Yang Liu, Zhifei Zhang, Zhaowen Wang, Wei Xiong, Xiang Bai
Abstract: Accurate text segmentation results are crucial for text-related generative tasks, such as text image generation, text editing, text removal, and text style transfer. Recently, some scene text segmentation methods have made significant progress in segmenting regular text, but they perform poorly in scenarios containing artistic text. This paper therefore focuses on the more challenging task of artistic text segmentation and constructs a real artistic text segmentation dataset. One challenge of the task is that the local stroke shapes of artistic text vary with great diversity and complexity. We propose a decoder with a layer-wise momentum query to prevent the model from ignoring stroke regions of special shapes. Another challenge is the complexity of the global topological structure. We further design a skeleton-assisted head to guide the model to focus on the global structure. Additionally, to enhance the generalization performance of the text segmentation model, we propose a training-data synthesis strategy based on a large multi-modal model and a diffusion model. Experimental results show that our proposed method and synthetic dataset can significantly enhance the performance of artistic text segmentation and achieve state-of-the-art results on other public datasets.
Submitted: 31 July, 2024; originally announced August 2024.
Comments: Accepted by ECCV 2024
arXiv:2407.21783 (https://arxiv.org/abs/2407.21783) [pdf, other]
Tags: cs.AI (Artificial Intelligence), cs.CL (Computation and Language), cs.CV (Computer Vision and Pattern Recognition)
Title: The Llama 3 Herd of Models
Authors: Aaron Grattafiori, Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Alex Vaughan, Amy Yang, Angela Fan, Anirudh Goyal, Anthony Hartshorn, Aobo Yang, Archi Mitra, Archie Sravankumar, Artem Korenev, Arthur Hinsvark, Arun Rao, Aston Zhang, Aurelien Rodriguez, Austen Gregerson, Ava Spataru, Baptiste Roziere, et al. (536 additional authors not shown)
Abstract: Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3: a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens.
This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B-parameter language model and our Llama Guard 3 model for input and output safety. The paper also presents the results of experiments in which we integrate image, video, and speech capabilities into Llama 3 via a compositional approach. We observe that this approach performs competitively with the state of the art on image, video, and speech recognition tasks. The resulting models are not yet being broadly released as they are still under development.
Submitted: 23 November, 2024; v1 submitted 31 July, 2024; originally announced July 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20756">arXiv:2407.20756</a> <span> [<a href="https://arxiv.org/pdf/2407.20756">pdf</a>, <a href="https://arxiv.org/format/2407.20756">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SynthVLM: High-Efficiency and High-Quality Synthetic Data for Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zheng Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+H">Hao Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bozhou Li</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+T">Tianyi Bai</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wentao Xiong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chong Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Bin Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20756v4-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) have recently emerged, demonstrating remarkable vision-understanding capabilities. However, training these models requires large-scale datasets, which brings challenges related to efficiency, effectiveness, quality, and privacy of web data. In this paper, we introduce SynthVLM, a novel data synthesis and curation method for generating image-caption pairs. Unlike tradi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20756v4-abstract-full').style.display = 'inline'; document.getElementById('2407.20756v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20756v4-abstract-full" style="display: none;"> Vision-Language Models (VLMs) have recently emerged, demonstrating remarkable vision-understanding capabilities. However, training these models requires large-scale datasets, which brings challenges related to efficiency, effectiveness, quality, and privacy of web data. In this paper, we introduce SynthVLM, a novel data synthesis and curation method for generating image-caption pairs. Unlike traditional methods, where captions are generated from images, SynthVLM utilizes advanced diffusion models and high-quality captions to automatically synthesize and select high-resolution images from text descriptions, thereby creating precisely aligned image-text pairs. To demonstrate the power of SynthVLM, we introduce SynthVLM-100K, a high-quality dataset consisting of 100,000 curated and synthesized image-caption pairs. In both model and human evaluations, SynthVLM-100K outperforms traditional real-world datasets. Leveraging this dataset, we develop a new family of multimodal large language models (MLLMs), SynthVLM-7B and SynthVLM-13B, which achieve state-of-the-art (SOTA) performance on various vision question-answering (VQA) tasks. 
Notably, our models outperform LLaVA across most metrics with only 18% of the pretraining data. Furthermore, SynthVLM-7B and SynthVLM-13B attain SOTA performance on the MMLU benchmark, demonstrating that the high-quality SynthVLM-100K dataset preserves language abilities. To facilitate future research, our dataset and the complete data generation and curation methods are open-sourced at https://github.com/starriver030515/SynthVLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20756v4-abstract-full').style.display = 'none'; document.getElementById('2407.20756v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15441">arXiv:2407.15441</a> <span> [<a href="https://arxiv.org/pdf/2407.15441">pdf</a>, <a href="https://arxiv.org/format/2407.15441">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Developing a Reliable, General-Purpose Hallucination Detection and Mitigation Service: Insights and Lessons Learned </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xun Wang</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jie Mei</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yujia Xie</a>, <a href="/search/cs?searchtype=author&query=Muarray%2C+S">Sean Muarray</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhang Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lingfeng Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Si-Qing Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wayne Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15441v1-abstract-short" style="display: inline;"> Hallucination, a phenomenon where large language models (LLMs) produce output that is factually incorrect or unrelated to the input, is a major challenge for LLM applications that require accuracy and dependability. In this paper, we introduce a reliable and high-speed production system aimed at detecting and rectifying the hallucination issue within LLMs. Our system encompasses named entity recog… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15441v1-abstract-full').style.display = 'inline'; document.getElementById('2407.15441v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15441v1-abstract-full" style="display: none;"> Hallucination, a phenomenon where large language models (LLMs) produce output that is factually incorrect or unrelated to the input, is a major challenge for LLM applications that require accuracy and dependability.
In this paper, we introduce a reliable and high-speed production system aimed at detecting and rectifying the hallucination issue within LLMs. Our system encompasses named entity recognition (NER), natural language inference (NLI), span-based detection (SBD), and an intricate decision tree-based process to reliably detect a wide range of hallucinations in LLM responses. Furthermore, our team has crafted a rewriting mechanism that maintains an optimal mix of precision, response time, and cost-effectiveness. We detail the core elements of our framework and underscore the paramount challenges tied to response time, availability, and performance metrics, which are crucial for real-world deployment of these technologies. Our extensive evaluation, utilizing offline data and live production traffic, confirms the efficacy of our proposed framework and service. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15441v1-abstract-full').style.display = 'none'; document.getElementById('2407.15441v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06654">arXiv:2407.06654</a> <span> [<a href="https://arxiv.org/pdf/2407.06654">pdf</a>, <a href="https://arxiv.org/format/2407.06654">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SoftDedup: an Efficient Data Reweighting Method for Speeding Up Language Model Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+N">Nan He</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Weichen Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hanwen Liu</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yi Liao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+L">Lei Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+G">Guohua Tang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiao Han</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06654v1-abstract-short" style="display: inline;"> The effectiveness of large language models (LLMs) is often hindered by duplicated data in their extensive pre-training datasets. Current approaches primarily focus on detecting and removing duplicates, which risks the loss of valuable information and neglects the varying degrees of duplication. 
To address this, we propose a soft deduplication method that maintains dataset integrity while selective… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06654v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06654v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06654v1-abstract-full" style="display: none;"> The effectiveness of large language models (LLMs) is often hindered by duplicated data in their extensive pre-training datasets. Current approaches primarily focus on detecting and removing duplicates, which risks the loss of valuable information and neglects the varying degrees of duplication. To address this, we propose a soft deduplication method that maintains dataset integrity while selectively reducing the sampling weight of data with high commonness. Central to our approach is the concept of "data commonness", a metric we introduce to quantify the degree of duplication by measuring the occurrence probabilities of samples using an n-gram model. Empirical analysis shows that this method significantly improves training efficiency, achieving comparable perplexity scores with at least a 26% reduction in required training steps. Additionally, it enhances average few-shot downstream accuracy by 1.77% when trained for an equivalent duration. Importantly, this approach consistently improves performance, even on rigorously deduplicated datasets, indicating its potential to complement existing methods and become a standard pre-training process for LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06654v1-abstract-full').style.display = 'none'; document.getElementById('2407.06654v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
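<p class="is-size-7">The core mechanism in the abstract above is easy to sketch: score each sample's "data commonness" with an n-gram model over the corpus, then down-weight common samples instead of dropping them. A minimal illustration of that idea, in which the scoring function, sharpness constant, and weight mapping are assumptions rather than the paper's exact recipe:</p>
<pre><code class="language-python">
from collections import Counter

def ngrams(tokens, n=4):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def build_counts(corpus, n=4):
    # Count n-grams over the whole corpus to estimate occurrence probabilities.
    counts = Counter()
    for tokens in corpus:
        counts.update(ngrams(tokens, n))
    return counts

def commonness(tokens, counts, total, n=4):
    # Average corpus probability of the sample's n-grams: a high value means
    # the sample looks heavily duplicated ("common") in the corpus.
    grams = ngrams(tokens, n)
    if not grams:
        return 0.0
    return sum(counts[g] / total for g in grams) / len(grams)

def sampling_weight(c, floor=0.1, sharpness=10.0):
    # Map commonness to a sampling weight in (floor, 1.0]: duplicated-looking
    # samples are sampled less often during pre-training, never removed.
    return max(floor, 1.0 / (1.0 + sharpness * c))

corpus = [s.split() for s in [
    "the quick brown fox jumps over the lazy dog",
    "the quick brown fox jumps over the lazy dog",
    "soft deduplication reweights data instead of removing it",
]]
counts = build_counts(corpus)
total = sum(counts.values())
for tokens in corpus:
    c = commonness(tokens, counts, total)
    print(round(c, 4), round(sampling_weight(c), 3))  # duplicates get lower weight
</code></pre>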
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16079">arXiv:2406.16079</a> <span> [<a href="https://arxiv.org/pdf/2406.16079">pdf</a>, <a href="https://arxiv.org/format/2406.16079">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EERPD: Leveraging Emotion and Emotion Regulation for Improving Personality Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Dawei Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qilong Ma</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Weimin Xiong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sujian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16079v1-abstract-short" style="display: inline;"> Personality is a fundamental construct in psychology, reflecting an individual's behavior, thinking, and emotional patterns. Previous researches have made some progress in personality detection, primarily by utilizing the whole text to predict personality. However, these studies generally tend to overlook psychological knowledge: they rarely apply the well-established correlations between emotion… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16079v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16079v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16079v1-abstract-full" style="display: none;"> Personality is a fundamental construct in psychology, reflecting an individual's behavior, thinking, and emotional patterns. Previous researches have made some progress in personality detection, primarily by utilizing the whole text to predict personality. However, these studies generally tend to overlook psychological knowledge: they rarely apply the well-established correlations between emotion regulation and personality. Based on this, we propose a new personality detection method called EERPD. This method introduces the use of emotion regulation, a psychological concept highly correlated with personality, for personality prediction. By combining this feature with emotion features, it retrieves few-shot examples and provides process CoTs for inferring labels from text. This approach enhances the understanding of LLM for personality within text and improves the performance in personality detection. Experimental results demonstrate that EERPD significantly enhances the accuracy and robustness of personality detection, outperforming previous SOTA by 15.05/4.29 in average F1 on the two benchmark datasets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16079v1-abstract-full').style.display = 'none'; document.getElementById('2406.16079v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12845">arXiv:2406.12845</a> <span> [<a href="https://arxiv.org/pdf/2406.12845">pdf</a>, <a href="https://arxiv.org/format/2406.12845">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Interpretable Preferences via Multi-Objective Reward Modeling and Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+T">Tengyang Xie</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Han Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12845v1-abstract-short" style="display: inline;"> Reinforcement learning from human feedback (RLHF) has emerged as the primary method for aligning large language models (LLMs) with human preferences. The RLHF process typically starts by training a reward model (RM) using human preference data. Conventional RMs are trained on pairwise responses to the same user request, with relative ratings indicating which response humans prefer. The trained RM… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12845v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12845v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12845v1-abstract-full" style="display: none;"> Reinforcement learning from human feedback (RLHF) has emerged as the primary method for aligning large language models (LLMs) with human preferences. The RLHF process typically starts by training a reward model (RM) using human preference data. Conventional RMs are trained on pairwise responses to the same user request, with relative ratings indicating which response humans prefer. The trained RM serves as a proxy for human preferences. However, due to the black-box nature of RMs, their outputs lack interpretability, as humans cannot intuitively understand why an RM thinks a response is good or not. As RMs act as human preference proxies, we believe they should be human-interpretable to ensure that their internal decision processes are consistent with human preferences and to prevent reward hacking in LLM alignment. 
To build RMs with interpretable preferences, we propose a two-stage approach: i) train an Absolute-Rating Multi-Objective Reward Model (ArmoRM) with multi-dimensional absolute-rating data, each dimension corresponding to a human-interpretable objective (e.g., honesty, verbosity, safety); ii) employ a Mixture-of-Experts (MoE) strategy with a gating network that automatically selects the most suitable reward objectives based on the context. We efficiently trained an ArmoRM with Llama-3 8B and a gating network consisting of a shallow MLP on top of the ArmoRM. Our trained model, ArmoRM-Llama3-8B, obtains state-of-the-art performance on RewardBench, a benchmark evaluating RMs for language modeling. Notably, the performance of our model surpasses the LLM-as-a-judge method with GPT-4 judges by a margin, and approaches the performance of the much larger Nemotron-4 340B reward model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12845v1-abstract-full').style.display = 'none'; document.getElementById('2406.12845v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report v1. Code and model are released at https://github.com/RLHFlow/RLHF-Reward-Modeling/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11176">arXiv:2406.11176</a> <span> [<a href="https://arxiv.org/pdf/2406.11176">pdf</a>, <a href="https://arxiv.org/format/2406.11176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Watch Every Step! LLM Agent Learning via Iterative Step-Level Process Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Weimin Xiong</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yifan Song</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiutian Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenhao Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xun Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Ke Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cheng Li</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+W">Wei Peng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sujian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11176v2-abstract-short" style="display: inline;"> Large language model agents have exhibited exceptional performance across a range of complex interactive tasks. 
Recent approaches have utilized tuning with expert trajectories to enhance agent performance, yet they primarily concentrate on outcome rewards, which may lead to errors or suboptimal actions due to the absence of process supervision signals. In this paper, we introduce the Iterative ste… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11176v2-abstract-full').style.display = 'inline'; document.getElementById('2406.11176v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11176v2-abstract-full" style="display: none;"> Large language model agents have exhibited exceptional performance across a range of complex interactive tasks. Recent approaches have utilized tuning with expert trajectories to enhance agent performance, yet they primarily concentrate on outcome rewards, which may lead to errors or suboptimal actions due to the absence of process supervision signals. In this paper, we introduce the Iterative step-level Process Refinement (IPR) framework, which provides detailed step-by-step guidance to enhance agent training. Specifically, we adopt the Monte Carlo method to estimate step-level rewards. During each iteration, the agent explores along the expert trajectory and generates new actions. These actions are then evaluated against the corresponding step of the expert trajectory using step-level rewards. Such comparisons help identify discrepancies, yielding contrastive action pairs that serve as training data for the agent. Our experiments on three complex agent tasks demonstrate that our framework outperforms a variety of strong baselines. Moreover, our analytical findings highlight the effectiveness of IPR in augmenting action efficiency and its applicability to diverse models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11176v2-abstract-full').style.display = 'none'; document.getElementById('2406.11176v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
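<p class="is-size-7">A compact sketch of the step-level loop the IPR abstract describes. The environment interface, rollout-based reward estimator, and pairing margin below are hypothetical placeholders; the paper's actual estimator and pairing rules may differ:</p>
<pre><code class="language-python">
def mc_step_reward(env, state, action, n_rollouts=8):
    # Monte Carlo estimate of a step-level reward: take the action, roll out
    # to the end of the task n times, and average the outcome rewards.
    returns = [env.rollout_to_end(state, action) for _ in range(n_rollouts)]
    return sum(returns) / len(returns)

def collect_contrastive_pairs(env, agent, expert_traj, margin=0.1):
    # Walk along the expert trajectory; wherever the agent's own action scores
    # clearly below the expert's step, keep (expert, agent) as a training pair.
    pairs = []
    for state, expert_action in expert_traj:
        agent_action = agent.act(state)
        r_expert = mc_step_reward(env, state, expert_action)
        r_agent = mc_step_reward(env, state, agent_action)
        if r_expert - r_agent > margin:
            pairs.append((state, expert_action, agent_action))
    return pairs

# Each iteration: collect pairs, update the agent with a preference-style loss
# on them, and repeat with the refined agent.
</code></pre>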
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024 (Main Conference)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02069">arXiv:2406.02069</a> <span> [<a href="https://arxiv.org/pdf/2406.02069">pdf</a>, <a href="https://arxiv.org/format/2406.02069">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PyramidKV: Dynamic KV Cache Compression based on Pyramidal Information Funneling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zefan Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Bofei Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuliang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wayne Xiong</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yue Dong</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+B">Baobao Chang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Junjie Hu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wen Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02069v3-abstract-short" style="display: inline;"> In this study, we investigate whether attention-based information flow inside large language models (LLMs) is aggregated through noticeable patterns for long context processing. Our observations reveal that LLMs aggregate information through Pyramidal Information Funneling where attention is scattering widely in lower layers, progressively consolidating within specific contexts, and ultimately foc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02069v3-abstract-full').style.display = 'inline'; document.getElementById('2406.02069v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02069v3-abstract-full" style="display: none;"> In this study, we investigate whether attention-based information flow inside large language models (LLMs) is aggregated through noticeable patterns for long context processing. Our observations reveal that LLMs aggregate information through Pyramidal Information Funneling where attention is scattering widely in lower layers, progressively consolidating within specific contexts, and ultimately focusing on critical tokens (a.k.a massive activation or attention sink) in higher layers. Motivated by these insights, we developed PyramidKV, a novel and effective KV cache compression method. This approach dynamically adjusts the KV cache size across different layers, allocating more cache in lower layers and less in higher ones, diverging from traditional methods that maintain a uniform KV cache size. 
Our experimental evaluations, utilizing the LongBench benchmark, show that PyramidKV matches the performance of models with a full KV cache while retaining only 12% of the KV cache, thus significantly reducing memory usage. In scenarios emphasizing memory efficiency, where only 0.7% of the KV cache is maintained, PyramidKV surpasses other KV cache compression techniques, achieving up to a 20.5 absolute accuracy improvement on the TREC dataset. In the Needle-in-a-Haystack experiment, PyramidKV outperforms competing methods in maintaining long-context comprehension in LLMs; notably, retaining just 128 KV cache entries enables the LLAMA-3-70B model to achieve 100% accuracy, matching that of a full KV cache. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02069v3-abstract-full').style.display = 'none'; document.getElementById('2406.02069v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17051">arXiv:2405.17051</a> <span> [<a href="https://arxiv.org/pdf/2405.17051">pdf</a>, <a href="https://arxiv.org/format/2405.17051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BeamVQ: Aligning Space-Time Forecasting Model via Self-training on Physics-aware Metrics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xingjian Shi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Ziyue Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Penghao Zhao</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jinbao Xue</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Y">Yangyu Tao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaomeng Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiyan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17051v1-abstract-short" style="display: inline;"> Data-driven deep learning has emerged as the new paradigm to model complex physical space-time systems. These data-driven methods learn patterns by optimizing statistical metrics and tend to overlook the adherence to physical laws, unlike traditional model-driven numerical methods. Thus, they often generate predictions that are not physically realistic.
On the other hand, by sampling a large amoun… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17051v1-abstract-full').style.display = 'inline'; document.getElementById('2405.17051v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17051v1-abstract-full" style="display: none;"> Data-driven deep learning has emerged as the new paradigm to model complex physical space-time systems. These data-driven methods learn patterns by optimizing statistical metrics and tend to overlook the adherence to physical laws, unlike traditional model-driven numerical methods. Thus, they often generate predictions that are not physically realistic. On the other hand, by sampling a large amount of high-quality predictions from a data-driven model, some predictions will be more physically plausible than others and closer to what will happen in the future. Based on this observation, we propose Beam search by Vector Quantization (BeamVQ) to enhance the physical alignment of data-driven space-time forecasting models. The key of BeamVQ is to train the model on self-generated samples filtered with physics-aware metrics. To flexibly support different backbone architectures, BeamVQ leverages a code bank to transform the continuous state space of any encoder-decoder model into discrete codes. Afterwards, it iteratively employs beam search to sample high-quality sequences, retains those with the highest physics-aware scores, and trains the model on the new dataset. Comprehensive experiments show that BeamVQ not only gives an average statistical skill score boost of more than 32% for ten backbones on five datasets, but also significantly enhances physics-aware metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17051v1-abstract-full').style.display = 'none'; document.getElementById('2405.17051v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
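<p class="is-size-7">A schematic of the self-training loop in the BeamVQ abstract. The model object, beam sampler over the discrete code space, and physics-aware scoring function are hypothetical placeholders standing in for the backbone, the code bank, and the metric:</p>
<pre><code class="language-python">
def beamvq_round(model, train_inputs, physics_score, beam_width=8, keep=2):
    # One self-training round: beam-search several candidate forecasts per
    # input in the quantized code space, score them with a physics-aware
    # metric, and keep only the most physically plausible candidates.
    new_dataset = []
    for x in train_inputs:
        candidates = model.beam_sample(x, beam_width)        # hypothetical API
        ranked = sorted(candidates, key=physics_score, reverse=True)
        new_dataset.extend((x, y) for y in ranked[:keep])
    model.fit(new_dataset)   # retrain on the filtered, self-generated samples
    return model

# Iterating this round a few times nudges the backbone toward forecasts that
# score well on the physics-aware metrics, as the abstract describes.
</code></pre>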
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10610">arXiv:2405.10610</a> <span> [<a href="https://arxiv.org/pdf/2405.10610">pdf</a>, <a href="https://arxiv.org/format/2405.10610">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Harnessing Vision-Language Pretrained Models with Temporal-Aware Adaptation for Referring Video Object Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zikun Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wentao Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Li Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhenyu He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10610v2-abstract-short" style="display: inline;"> The crux of Referring Video Object Segmentation (RVOS) lies in modeling dense text-video relations to associate abstract linguistic concepts with dynamic visual contents at pixel-level. Current RVOS methods typically use vision and language models pretrained independently as backbones. As images and texts are mapped to uncoupled feature spaces, they face the arduous task of learning Vision-Languag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10610v2-abstract-full').style.display = 'inline'; document.getElementById('2405.10610v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10610v2-abstract-full" style="display: none;"> The crux of Referring Video Object Segmentation (RVOS) lies in modeling dense text-video relations to associate abstract linguistic concepts with dynamic visual contents at pixel-level. Current RVOS methods typically use vision and language models pretrained independently as backbones. As images and texts are mapped to uncoupled feature spaces, they face the arduous task of learning Vision-Language (VL) relation modeling from scratch. Witnessing the success of Vision-Language Pretrained (VLP) models, we propose to learn relation modeling for RVOS based on their aligned VL feature space. Nevertheless, transferring VLP models to RVOS is a deceptively challenging task due to the substantial gap between the pretraining task (static image/region-level prediction) and the RVOS task (dynamic pixel-level prediction). To address this transfer challenge, we introduce a framework named VLP-RVOS which harnesses VLP models for RVOS through temporal-aware adaptation. We first propose a temporal-aware prompt-tuning method, which not only adapts pretrained representations for pixel-level prediction but also empowers the vision encoder to model temporal contexts. We further customize a cube-frame attention mechanism for robust spatial-temporal reasoning. Besides, we propose to perform multi-stage VL relation modeling while and after feature extraction for comprehensive VL understanding. 
Extensive experiments demonstrate that our method performs favorably against state-of-the-art algorithms and exhibits strong generalization abilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10610v2-abstract-full').style.display = 'none'; document.getElementById('2405.10610v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.07863">arXiv:2405.07863</a> <span> [<a href="https://arxiv.org/pdf/2405.07863">pdf</a>, <a href="https://arxiv.org/format/2405.07863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> RLHF Workflow: From Reward Modeling to Online RLHF </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+H">Hanze Dong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+B">Bo Pang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Han Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yingbo Zhou</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+N">Nan Jiang</a>, <a href="/search/cs?searchtype=author&query=Sahoo%2C+D">Doyen Sahoo</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.07863v3-abstract-short" style="display: inline;"> We present the workflow of Online Iterative Reinforcement Learning from Human Feedback (RLHF) in this technical report, which is widely reported to outperform its offline counterpart by a large margin in the recent large language model (LLM) literature. However, existing open-source RLHF projects are still largely confined to the offline learning setting. 
In this technical report, we aim to fill i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07863v3-abstract-full').style.display = 'inline'; document.getElementById('2405.07863v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.07863v3-abstract-full" style="display: none;"> We present the workflow of Online Iterative Reinforcement Learning from Human Feedback (RLHF) in this technical report, which is widely reported to outperform its offline counterpart by a large margin in the recent large language model (LLM) literature. However, existing open-source RLHF projects are still largely confined to the offline learning setting. In this technical report, we aim to fill in this gap and provide a detailed recipe that is easy to reproduce for online iterative RLHF. In particular, since online human feedback is usually infeasible for open-source communities with limited resources, we start by constructing preference models using a diverse set of open-source datasets and use the constructed proxy preference model to approximate human feedback. Then, we discuss the theoretical insights and algorithmic principles behind online iterative RLHF, followed by a detailed practical implementation. Our trained LLM achieves impressive performance on LLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as well as other academic benchmarks such as HumanEval and TruthfulQA. We have shown that supervised fine-tuning (SFT) and iterative RLHF can obtain state-of-the-art performance with fully open-source datasets. Further, we have made our models, curated datasets, and comprehensive step-by-step code guidebooks publicly available. Please refer to https://github.com/RLHFlow/RLHF-Reward-Modeling and https://github.com/RLHFlow/Online-RLHF for more detailed information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07863v3-abstract-full').style.display = 'none'; document.getElementById('2405.07863v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
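<p class="is-size-7">A bare-bones version of the iterative loop described above. The proxy reward model, generation call, and preference update are placeholders; the released recipe in the linked repositories is far more detailed:</p>
<pre><code class="language-python">
def online_iterative_rlhf(policy, ref_policy, reward_model, prompts,
                          iterations=3, k=8):
    # Each iteration: sample k responses per prompt, rank them with the proxy
    # reward model (standing in for online human feedback), and update the
    # policy on best-vs-worst preference pairs against a fixed reference.
    for _ in range(iterations):
        pairs = []
        for prompt in prompts:
            responses = [policy.generate(prompt) for _ in range(k)]
            ranked = sorted(responses,
                            key=lambda r: reward_model.score(prompt, r))
            pairs.append((prompt, ranked[-1], ranked[0]))  # (chosen, rejected)
        policy.preference_update(pairs, ref=ref_policy)    # e.g., a DPO step
    return policy
</code></pre>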
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in Transactions on Machine Learning Research (09/2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.05193">arXiv:2405.05193</a> <span> [<a href="https://arxiv.org/pdf/2405.05193">pdf</a>, <a href="https://arxiv.org/format/2405.05193">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Systematic Use of Random Self-Reducibility against Physical Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Erata%2C+F">Ferhat Erata</a>, <a href="/search/cs?searchtype=author&query=Chiu%2C+T">TingHung Chiu</a>, <a href="/search/cs?searchtype=author&query=Etim%2C+A">Anthony Etim</a>, <a href="/search/cs?searchtype=author&query=Nampally%2C+S">Srilalith Nampally</a>, <a href="/search/cs?searchtype=author&query=Raju%2C+T">Tejas Raju</a>, <a href="/search/cs?searchtype=author&query=Ramu%2C+R">Rajashree Ramu</a>, <a href="/search/cs?searchtype=author&query=Piskac%2C+R">Ruzica Piskac</a>, <a href="/search/cs?searchtype=author&query=Antonopoulos%2C+T">Timos Antonopoulos</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wenjie Xiong</a>, <a href="/search/cs?searchtype=author&query=Szefer%2C+J">Jakub Szefer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.05193v1-abstract-short" style="display: inline;"> This work presents a novel, black-box software-based countermeasure against physical attacks including power side-channel and fault-injection attacks. The approach uses the concept of random self-reducibility and self-correctness to add randomness and redundancy in the execution for protection. Our approach is at the operation level, is not algorithm-specific, and thus, can be applied for protecti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05193v1-abstract-full').style.display = 'inline'; document.getElementById('2405.05193v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.05193v1-abstract-full" style="display: none;"> This work presents a novel, black-box software-based countermeasure against physical attacks including power side-channel and fault-injection attacks. The approach uses the concept of random self-reducibility and self-correctness to add randomness and redundancy in the execution for protection. Our approach is at the operation level, is not algorithm-specific, and thus, can be applied for protecting a wide range of algorithms. The countermeasure is empirically evaluated against attacks over operations like modular exponentiation, modular multiplication, polynomial multiplication, and number theoretic transforms. An end-to-end implementation of this countermeasure is demonstrated for RSA-CRT signature algorithm and Kyber Key Generation public key cryptosystems. The countermeasure reduced the power side-channel leakage by two orders of magnitude, to an acceptably secure level in TVLA analysis. 
For fault injection, the countermeasure reduces the number of faults to 95.4% on average. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05193v1-abstract-full').style.display = 'none'; document.getElementById('2405.05193v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.05164">arXiv:2405.05164</a> <span> [<a href="https://arxiv.org/pdf/2405.05164">pdf</a>, <a href="https://arxiv.org/format/2405.05164">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ProbRadarM3F: mmWave Radar based Human Skeletal Pose Estimation with Probability Map Guided Multi-Format Feature Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+B">Bing Zhu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zixin He</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Weiyi Xiong</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+G">Guanhua Ding</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianan Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tao Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+W">Wei Xiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.05164v2-abstract-short" style="display: inline;"> Millimeter wave (mmWave) radar is a non-intrusive, privacy-preserving, and relatively convenient and inexpensive device, which has been demonstrated to be applicable in place of RGB cameras in human indoor pose estimation tasks. However, mmWave radar relies on the collection of reflected signals from the target, and the information contained in the radar signals is difficult to fully exploit. This has been a long… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05164v2-abstract-full').style.display = 'inline'; document.getElementById('2405.05164v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.05164v2-abstract-full" style="display: none;"> Millimeter wave (mmWave) radar is a non-intrusive, privacy-preserving, and relatively convenient and inexpensive device, which has been demonstrated to be applicable in place of RGB cameras in human indoor pose estimation tasks. However, mmWave radar relies on the collection of reflected signals from the target, and the information contained in the radar signals is difficult to fully exploit. This has been a long-standing hindrance to the improvement of pose estimation accuracy. To address this major challenge, this paper introduces a probability map guided multi-format feature fusion model, ProbRadarM3F. This is a novel radar feature extraction framework using a traditional FFT method in parallel with a probability map based positional encoding method.
ProbRadarM3F fuses the traditional heatmap features with the positional features to effectively estimate 14 keypoints of the human body. Experimental evaluation on the HuPR dataset demonstrates the effectiveness of the proposed model, which outperforms other methods evaluated on this dataset with an AP of 69.9%. Our study emphasizes the positional information that has not been exploited before in the radar signal, which provides a direction for investigating other potential non-redundant information from mmWave radar. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05164v2-abstract-full').style.display = 'none'; document.getElementById('2405.05164v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01525">arXiv:2405.01525</a> <span> [<a href="https://arxiv.org/pdf/2405.01525">pdf</a>, <a href="https://arxiv.org/format/2405.01525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FLAME: Factuality-Aware Alignment for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+S">Sheng-Chieh Lin</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Luyu Gao</a>, <a href="/search/cs?searchtype=author&query=Oguz%2C+B">Barlas Oguz</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wenhan Xiong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a>, <a href="/search/cs?searchtype=author&query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xilun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01525v1-abstract-short" style="display: inline;"> Alignment is a standard procedure to fine-tune pre-trained large language models (LLMs) to follow natural language instructions and serve as helpful AI assistants. We have observed, however, that the conventional alignment process fails to enhance the factual accuracy of LLMs, and often leads to the generation of more false facts (i.e. hallucination). In this paper, we study how to make the LLM al… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01525v1-abstract-full').style.display = 'inline'; document.getElementById('2405.01525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01525v1-abstract-full" style="display: none;"> Alignment is a standard procedure to fine-tune pre-trained large language models (LLMs) to follow natural language instructions and serve as helpful AI assistants.
We have observed, however, that the conventional alignment process fails to enhance the factual accuracy of LLMs, and often leads to the generation of more false facts (i.e. hallucination). In this paper, we study how to make the LLM alignment process more factual, by first identifying factors that lead to hallucination in both alignment steps: supervised fine-tuning (SFT) and reinforcement learning (RL). In particular, we find that training the LLM on new knowledge or unfamiliar texts can encourage hallucination. This makes SFT less factual as it trains on human labeled data that may be novel to the LLM. Furthermore, reward functions used in standard RL can also encourage hallucination, because they guide the LLM to provide more helpful responses on a diverse set of instructions, often preferring longer and more detailed responses. Based on these observations, we propose factuality-aware alignment, composed of factuality-aware SFT and factuality-aware RL through direct preference optimization. Experiments show that our proposed factuality-aware alignment guides LLMs to output more factual responses while maintaining instruction-following capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01525v1-abstract-full').style.display = 'none'; document.getElementById('2405.01525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.18922">arXiv:2404.18922</a> <span> [<a href="https://arxiv.org/pdf/2404.18922">pdf</a>, <a href="https://arxiv.org/format/2404.18922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> DPO Meets PPO: Reinforced Token Optimization for RLHF </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+H">Han Zhong</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Z">Zikang Shan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+G">Guhao Feng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinle Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Li Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+D">Di He</a>, <a href="/search/cs?searchtype=author&query=Bian%2C+J">Jiang Bian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.18922v3-abstract-short" style="display: inline;"> In the classical Reinforcement Learning from Human Feedback (RLHF) framework, Proximal Policy Optimization (PPO) is employed to learn from sparse,
sentence-level rewards -- a challenging scenario in traditional deep reinforcement learning. Despite the great successes of PPO in the alignment of large language models, its open-source implementation is still largely sub-optimal. To address these issu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18922v3-abstract-full').style.display = 'inline'; document.getElementById('2404.18922v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.18922v3-abstract-full" style="display: none;"> In the classical Reinforcement Learning from Human Feedback (RLHF) framework, Proximal Policy Optimization (PPO) is employed to learn from sparse, sentence-level rewards -- a challenging scenario in traditional deep reinforcement learning. Despite the great successes of PPO in the alignment of large language models, its open-source implementation is still largely sub-optimal. To address these issues, we introduce a framework that models RLHF problems as a Markov decision process (MDP), enabling the capture of fine-grained token-wise information. Under this framework, we introduce an algorithm Reinforced Token Optimization (RTO), which learns the token-wise reward function from preference data and performs policy optimization based on this learned token-wise reward signal. Theoretically, RTO is proven to have the capability of finding the near-optimal policy sample-efficiently. For its practical implementation, RTO innovatively integrates Direct Preference Optimization (DPO) and PPO. DPO, originally derived from sparse sentence rewards, surprisingly provides us with a token-wise characterization of response quality, which is seamlessly incorporated into our subsequent PPO training stage. Extensive experiments demonstrate that RTO performs better than PPO and other direct preference learning algorithms. In particular, RTO outperforms PPO by 7.5 points on the AlpacaEval 2 benchmark and by 4.1 points on Arena-Hard. Our code and models are available at https://github.com/zkshan2002/RTO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18922v3-abstract-full').style.display = 'none'; document.getElementById('2404.18922v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024.
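<p class="is-size-7">The token-wise reward that the RTO abstract extracts from DPO has a simple closed form: beta times the per-token log-probability ratio between the DPO-trained policy and the reference policy. A sketch of just that computation, with the log-probability lists standing in for real model outputs:</p>
<pre><code class="language-python">
import math

def token_wise_rewards(dpo_logprobs, ref_logprobs, beta=0.1):
    # r_t = beta * (log pi_dpo(a_t | s_t) - log pi_ref(a_t | s_t)):
    # a dense per-token reward recovered from a DPO-trained policy, which can
    # then drive a standard PPO stage instead of one sparse sentence reward.
    return [beta * (lp - lr) for lp, lr in zip(dpo_logprobs, ref_logprobs)]

# Toy check: tokens the DPO policy prefers over the reference get positive reward.
dpo_lp = [math.log(0.60), math.log(0.20), math.log(0.05)]
ref_lp = [math.log(0.30), math.log(0.25), math.log(0.05)]
print([round(r, 4) for r in token_wise_rewards(dpo_lp, ref_lp)])
</code></pre>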
arXiv:2404.08801 [pdf, other] cs.LG cs.CL
Megalodon: Efficient LLM Pretraining and Inference with Unlimited Context Length
Authors: Xuezhe Ma, Xiaomeng Yang, Wenhan Xiong, Beidi Chen, Lili Yu, Hao Zhang, Jonathan May, Luke Zettlemoyer, Omer Levy, Chunting Zhou
Abstract: The quadratic complexity and weak length extrapolation of Transformers limit their ability to scale to long sequences, and while sub-quadratic solutions like linear attention and state space models exist, they empirically underperform Transformers in pretraining efficiency and downstream task accuracy. We introduce Megalodon, a neural architecture for efficient sequence modeling with unlimited context length. Megalodon inherits the architecture of Mega (exponential moving average with gated attention) and further introduces multiple technical components to improve its capability and stability, including the complex exponential moving average (CEMA), a timestep normalization layer, a normalized attention mechanism, and pre-norm with a two-hop residual configuration. In a controlled head-to-head comparison with Llama2, Megalodon achieves better efficiency than the Transformer at the scale of 7 billion parameters and 2 trillion training tokens. Megalodon reaches a training loss of 1.70, landing midway between Llama2-7B (1.75) and 13B (1.67). Code: https://github.com/XuezheMax/megalodon
Submitted 16 April, 2024; v1 submitted 12 April, 2024; originally announced April 2024.
Comments: 9 pages, 6 figures and 8 tables
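For intuition, the complex exponential moving average named above generalizes a scalar EMA to a complex-valued decay, so the hidden state oscillates as it decays. A toy sketch; the parameterization here is illustrative, not Megalodon's actual one.

```python
# Toy CEMA-style recurrence: an EMA whose decay alpha = r * exp(i*theta) is complex.
import numpy as np

def cema(x, r=0.9, theta=0.3):
    alpha = r * np.exp(1j * theta)           # complex decay
    h = 0.0 + 0.0j
    out = np.empty_like(x, dtype=float)
    for t, xt in enumerate(x):
        h = alpha * h + (1 - alpha) * xt     # complex recurrence over the sequence
        out[t] = h.real                      # project back to the reals
    return out

print(cema(np.ones(8)))                      # damped, oscillating step response
```
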
arXiv:2404.05717 [pdf, other] cs.CV cs.AI
SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual Editing
Authors: Jing Gu, Nanxuan Zhao, Wei Xiong, Qing Liu, Zhifei Zhang, He Zhang, Jianming Zhang, HyunJoon Jung, Yilin Wang, Xin Eric Wang
Abstract: Effective editing of personal content plays a pivotal role in enabling individuals to express their creativity, weave captivating narratives within their visual stories, and elevate the overall quality and impact of their visual content. Therefore, in this work, we introduce SwapAnything, a novel framework that can swap any objects in an image with personalized concepts given by a reference, while keeping the context unchanged. Compared with existing methods for personalized subject swapping, SwapAnything has three unique advantages: (1) precise control of arbitrary objects and parts rather than the main subject, (2) more faithful preservation of context pixels, and (3) better adaptation of the personalized concept to the image. First, we propose targeted variable swapping to apply region control over latent feature maps and swap masked variables for faithful context preservation and initial semantic concept swapping. Then, we introduce appearance adaptation to seamlessly adapt the semantic concept into the original image in terms of target location, shape, style, and content during the image generation process. Extensive results on both human and automatic evaluation demonstrate significant improvements of our approach over baseline methods on personalized swapping. Furthermore, SwapAnything shows precise and faithful swapping abilities across single-object, multiple-object, partial-object, and cross-domain swapping tasks. SwapAnything also achieves great performance on text-based swapping and on tasks beyond swapping, such as object insertion.
Submitted 3 October, 2024; v1 submitted 8 April, 2024; originally announced April 2024.
Comments: ECCV 2024, 23 pages, 14 figures, 3 tables
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11401v2-abstract-full').style.display = 'none'; document.getElementById('2403.11401v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10701">arXiv:2403.10701</a> <span> [<a href="https://arxiv.org/pdf/2403.10701">pdf</a>, <a href="https://arxiv.org/format/2403.10701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IMPRINT: Generative Object Compositing by Learning Identity-Preserving Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yizhi Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+S">Scott Cohen</a>, <a href="/search/cs?searchtype=author&query=Price%2C+B">Brian Price</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Aliaga%2C+D">Daniel Aliaga</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10701v1-abstract-short" style="display: inline;"> Generative object compositing emerges as a promising new avenue for compositional image editing. However, the requirement of object identity preservation poses a significant challenge, limiting practical usage of most existing methods. In response, this paper introduces IMPRINT, a novel diffusion-based generative model trained with a two-stage learning framework that decouples learning of identity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10701v1-abstract-full').style.display = 'inline'; document.getElementById('2403.10701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10701v1-abstract-full" style="display: none;"> Generative object compositing emerges as a promising new avenue for compositional image editing. However, the requirement of object identity preservation poses a significant challenge, limiting practical usage of most existing methods. In response, this paper introduces IMPRINT, a novel diffusion-based generative model trained with a two-stage learning framework that decouples learning of identity preservation from that of compositing. 
arXiv:2403.10701 [pdf, other] cs.CV
IMPRINT: Generative Object Compositing by Learning Identity-Preserving Representation
Authors: Yizhi Song, Zhifei Zhang, Zhe Lin, Scott Cohen, Brian Price, Jianming Zhang, Soo Ye Kim, He Zhang, Wei Xiong, Daniel Aliaga
Abstract: Generative object compositing is emerging as a promising new avenue for compositional image editing. However, the requirement of object identity preservation poses a significant challenge, limiting the practical usage of most existing methods. In response, this paper introduces IMPRINT, a novel diffusion-based generative model trained with a two-stage learning framework that decouples the learning of identity preservation from that of compositing. The first stage targets context-agnostic, identity-preserving pretraining of the object encoder, enabling the encoder to learn an embedding that is both view-invariant and conducive to enhanced detail preservation. The subsequent stage leverages this representation to learn seamless harmonization of the object composited onto the background. In addition, IMPRINT incorporates a shape-guidance mechanism offering user-directed control over the compositing process. Extensive experiments demonstrate that IMPRINT significantly outperforms existing methods and various baselines on identity preservation and composition quality.
Submitted 15 March, 2024; originally announced March 2024.
arXiv:2403.08730 [pdf, other] cs.CL cs.CV
Strengthening Multimodal Large Language Model with Bootstrapped Preference Optimization
Authors: Renjie Pi, Tianyang Han, Wei Xiong, Jipeng Zhang, Runtao Liu, Rui Pan, Tong Zhang
Abstract: Multimodal Large Language Models (MLLMs) excel in generating responses based on visual inputs. However, they often suffer from a bias towards generating responses similar to their pretraining corpus, overshadowing the importance of visual information. We treat this bias as a "preference" for pretraining statistics, which hinders the model's grounding in visual input. To mitigate this issue, we propose Bootstrapped Preference Optimization (BPO), which conducts preference learning with datasets containing negative responses bootstrapped from the model itself. Specifically, we propose the following two strategies: 1) using distorted image inputs to the MLLM to elicit responses that exhibit significant pretraining bias; 2) leveraging a text-based LLM to explicitly inject erroneous but common elements into the original response. Those undesirable responses are paired with the original annotated responses from the datasets to construct the preference dataset, which is subsequently used to perform preference learning. Our approach effectively suppresses pretrained LLM bias, enabling enhanced grounding in visual inputs. Extensive experimentation demonstrates significant performance improvements across multiple benchmarks, advancing the state of the art in multimodal conversational systems.
Submitted 3 April, 2024; v1 submitted 13 March, 2024; originally announced March 2024.
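Strategy 1) amounts to pairing annotated responses (chosen) with responses the model produces from corrupted images (rejected). A sketch under assumptions: `generate(image, prompt)` stands in for any MLLM inference call, and the Gaussian-blur distortion is one illustrative choice, not necessarily the paper's.

```python
# Sketch: build BPO-style preference pairs from a model's own biased outputs.
from PIL import ImageFilter

def build_pairs(dataset, generate):
    pairs = []
    for ex in dataset:  # ex: {"image": PIL.Image, "prompt": str, "response": str}
        distorted = ex["image"].filter(ImageFilter.GaussianBlur(radius=8))
        rejected = generate(distorted, ex["prompt"])   # bias-revealing negative
        pairs.append({"prompt": ex["prompt"], "image": ex["image"],
                      "chosen": ex["response"], "rejected": rejected})
    return pairs
```
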
arXiv:2402.18571 [pdf, other] cs.LG cs.AI cs.CL stat.ML
Arithmetic Control of LLMs for Diverse User Preferences: Directional Preference Alignment with Multi-Objective Rewards
Authors: Haoxiang Wang, Yong Lin, Wei Xiong, Rui Yang, Shizhe Diao, Shuang Qiu, Han Zhao, Tong Zhang
Abstract: Fine-grained control over large language models (LLMs) remains a significant challenge, hindering their adaptability to diverse user needs. While Reinforcement Learning from Human Feedback (RLHF) shows promise in aligning LLMs, its reliance on scalar rewards often limits its ability to capture diverse user preferences in real-world applications. To address this limitation, we introduce the Directional Preference Alignment (DPA) framework. Unlike scalar-reward RLHF, DPA incorporates multi-objective reward modeling to represent diverse preference profiles. Additionally, DPA models user preferences as directions (i.e., unit vectors) in the reward space to achieve user-dependent preference control. Our method involves training a multi-objective reward model and then fine-tuning the LLM with a preference-conditioned variant of Rejection Sampling Finetuning (RSF), an RLHF method adopted by Llama 2. This method enjoys a better performance trade-off across various reward objectives. In comparison with scalar-reward RLHF, DPA offers users intuitive control over LLM generation: they can arithmetically specify their desired trade-offs (e.g., more helpfulness with less verbosity). We also validate the effectiveness of DPA with real-world alignment experiments on Mistral-7B. Our method provides straightforward arithmetic control over the trade-off between helpfulness and verbosity while maintaining competitive performance with strong baselines such as Direct Preference Optimization (DPO).
Submitted 6 March, 2024; v1 submitted 28 February, 2024; originally announced February 2024.
Comments: The code and model are released at https://github.com/Haoxiang-Wang/directional-preference-alignment
arXiv:2402.18264 [pdf, other] cs.CL
WIKIGENBENCH: Exploring Full-length Wikipedia Generation under Real-World Scenario
Authors: Jiebin Zhang, Eugene J. Yu, Qinyu Chen, Chenhao Xiong, Dawei Zhu, Han Qian, Mingbo Song, Weimin Xiong, Xiaoguang Li, Qun Liu, Sujian Li
Abstract: Generating comprehensive and accurate Wikipedia articles for newly emerging events in a real-world scenario presents significant challenges. Existing attempts fall short either by focusing only on short snippets or by using metrics that are insufficient to evaluate real-world scenarios. In this paper, we construct WIKIGENBENCH, a new benchmark consisting of 1,320 entries, designed to align with real-world scenarios in both generation and evaluation. For generation, we explore a real-world scenario where structured, full-length Wikipedia articles with citations are generated for new events using input documents from web sources. For evaluation, we integrate systematic metrics and LLM-based metrics to assess verifiability, organization, and other aspects aligned with real-world scenarios. Based on this benchmark, we conduct extensive experiments using various models within three commonly used frameworks: direct RAG, hierarchical structure-based RAG, and RAG with a fine-tuned generation model. Experimental results show that hierarchical methods can generate more comprehensive content, while fine-tuned methods achieve better verifiability. However, even the best methods still show a significant gap compared to existing Wikipedia content, indicating that further research is necessary.
Submitted 17 December, 2024; v1 submitted 28 February, 2024; originally announced February 2024.
Comments: COLING 2025 Camera Ready
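A bare-bones version of the "direct RAG" baseline named above, as we read it (our sketch; `llm` stands in for any text-generation call, and the token-overlap retriever is a deliberately crude placeholder):

```python
# Sketch: direct RAG for article generation -- retrieve top-k sources, then prompt.
def direct_rag(title, docs, llm, k=5):
    q = set(title.lower().split())
    ranked = sorted(docs, key=lambda d: len(q & set(d.lower().split())), reverse=True)
    context = "\n\n".join(ranked[:k])
    return llm(f"Write a Wikipedia article on {title}, with citations, "
               f"based on these sources:\n{context}")
```
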
arXiv:2402.17525 [pdf, other] cs.CV
Diffusion Model-Based Image Editing: A Survey
Authors: Yi Huang, Jiancheng Huang, Yifan Liu, Mingfu Yan, Jiaxi Lv, Jianzhuang Liu, Wei Xiong, He Zhang, Shifeng Chen, Liangliang Cao
Abstract: Denoising diffusion models have emerged as a powerful tool for various image generation and editing tasks, facilitating the synthesis of visual content in an unconditional or input-conditional manner. The core idea behind them is learning to reverse the process of gradually adding noise to images, allowing them to generate high-quality samples from a complex distribution. In this survey, we provide an exhaustive overview of existing methods using diffusion models for image editing, covering both theoretical and practical aspects of the field. We delve into a thorough analysis and categorization of these works from multiple perspectives, including learning strategies, user-input conditions, and the array of specific editing tasks that can be accomplished. In addition, we pay special attention to image inpainting and outpainting, exploring both earlier traditional context-driven and current multimodal conditional methods, and offering a comprehensive analysis of their methodologies. To further evaluate the performance of text-guided image editing algorithms, we propose a systematic benchmark, EditEval, featuring an innovative metric, LMM Score. Finally, we address current limitations and envision some potential directions for future research. The accompanying repository is released at https://github.com/SiatMMLab/Awesome-Diffusion-Model-Based-Image-Editing-Methods.
Submitted 16 March, 2024; v1 submitted 27 February, 2024; originally announced February 2024.
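The noising-and-reversal idea the survey builds on fits in one training step. This is the standard DDPM recipe, not any specific surveyed method; the `model(x_t, t)` signature is an assumption.

```python
# Sketch: one DDPM training step -- corrupt an image at a random timestep,
# then train the network to predict the injected noise.
import torch

def ddpm_loss(model, x0, alphas_cumprod):
    t = torch.randint(0, len(alphas_cumprod), (x0.shape[0],))
    a = alphas_cumprod[t].view(-1, 1, 1, 1)
    eps = torch.randn_like(x0)
    x_t = a.sqrt() * x0 + (1 - a).sqrt() * eps      # forward noising
    return ((model(x_t, t) - eps) ** 2).mean()      # reverse-process objective
```
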
arXiv:2402.07314 [pdf, ps, other] cs.LG stat.ML
Online Iterative Reinforcement Learning from Human Feedback with General Preference Model
Authors: Chenlu Ye, Wei Xiong, Yuheng Zhang, Hanze Dong, Nan Jiang, Tong Zhang
Abstract: We investigate Reinforcement Learning from Human Feedback (RLHF) in the context of a general preference oracle. In particular, we do not assume the existence of a reward function and an oracle preference signal drawn from the Bradley-Terry model, as most prior works do. We consider a standard mathematical formulation, the reverse-KL regularized minimax game between two LLMs, for RLHF under a general preference oracle. The learning objective of this formulation is to find a policy that is consistently preferred by the KL-regularized preference oracle over any competing LLM. We show that this framework is strictly more general than the reward-based one, and we propose sample-efficient algorithms both for offline learning from a pre-collected preference dataset and for online learning, where the preference oracle can be queried along the way during training. Empirical studies verify the effectiveness of the proposed framework.
Submitted 12 November, 2024; v1 submitted 11 February, 2024; originally announced February 2024.
Comments: RLHF, Preference Learning, Alignment for LLMs
arXiv:2401.02831 [pdf, other] cs.CV eess.IV
Two-stage Progressive Residual Dense Attention Network for Image Denoising
Authors: Wencong Wu, An Ge, Guannan Lv, Yuelong Xia, Yungang Zhang, Wen Xiong
Abstract: Deep convolutional neural networks (CNNs) for image denoising can effectively exploit rich hierarchical features and have achieved great success. However, many deep CNN-based denoising models utilize the hierarchical features of noisy images equally, without attending to the more important and useful features, which leads to relatively low performance. To address this issue, we design a new Two-stage Progressive Residual Dense Attention Network (TSP-RDANet) for image denoising, which divides the whole denoising process into two sub-tasks that remove noise progressively. Two different attention-based denoising networks are designed for the two sequential sub-tasks: the residual dense attention module (RDAM) is designed for the first stage, and the hybrid dilated residual dense attention module (HDRDAM) is proposed for the second stage. The proposed attention modules are able to learn appropriate local features through dense connections between different convolutional layers, while irrelevant features are suppressed. The two sub-networks are then connected by a long skip connection to retain the shallow features and enhance the denoising performance. Experiments on seven benchmark datasets verify that, compared with many state-of-the-art methods, the proposed TSP-RDANet obtains favorable results on both synthetic and real noisy image denoising. The code of our TSP-RDANet is available at https://github.com/WenCongWu/TSP-RDANet.
Submitted 5 January, 2024; originally announced January 2024.
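The two-stage layout with a long skip connection can be sketched as a skeleton. Module internals are elided (the RDAM/HDRDAM placeholders stand for the paper's attention sub-networks), and adding the input exactly as shown is our simplification of "retain the shallow features".

```python
# Sketch: progressive two-stage denoiser with a long skip connection.
import torch.nn as nn

class TwoStageDenoiser(nn.Module):
    def __init__(self, stage1: nn.Module, stage2: nn.Module):
        super().__init__()
        self.stage1 = stage1     # RDAM-based sub-network (placeholder)
        self.stage2 = stage2     # HDRDAM-based sub-network (placeholder)

    def forward(self, noisy):
        coarse = self.stage1(noisy)      # first denoising sub-task
        refined = self.stage2(coarse)    # progressive refinement
        return refined + noisy           # long skip retains shallow features
```
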
arXiv:2312.15124 [pdf, other] quant-ph cs.ET cs.LG stat.ML
On fundamental aspects of quantum extreme learning machines
Authors: Weijie Xiong, Giorgio Facelli, Mehrad Sahebi, Owen Agnel, Thiparat Chotibut, Supanut Thanasilp, Zoë Holmes
Abstract: Quantum Extreme Learning Machines (QELMs) have emerged as a promising framework for quantum machine learning. Their appeal lies in the rich feature map induced by the dynamics of a quantum substrate -- the quantum reservoir -- and the efficient post-measurement training via linear regression. Here we study the expressivity of QELMs by decomposing the prediction of QELMs into a Fourier series. We show that the achievable Fourier frequencies are determined by the data encoding scheme, while the Fourier coefficients depend on both the reservoir and the measurement. Notably, the expressivity of QELMs is fundamentally limited by the number of Fourier frequencies and the number of observables, while the complexity of the prediction hinges on the reservoir. As a cautionary note on scalability, we identify four sources that can lead to the exponential concentration of the observables as the system size grows (randomness, hardware noise, entanglement, and global measurements) and show how this can turn QELMs into useless input-agnostic oracles. In particular, our result on reservoir-induced concentration strongly indicates that quantum reservoirs drawn from a highly random ensemble make QELM models unscalable. Our analysis elucidates the potential and fundamental limitations of QELMs and lays the groundwork for systematically exploring quantum reservoir systems for other machine learning tasks.
Submitted 27 September, 2024; v1 submitted 22 December, 2023; originally announced December 2023.
Comments: 20+21 pages, 9+2 figures