Search | arXiv e-print repository
Showing 1–50 of 470 results for author: Xiong, C

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.

1. arXiv:2502.15543 [pdf, other]  cs.CL cs.AI
PIP-KAG: Mitigating Knowledge Conflicts in Knowledge-Augmented Generation via Parametric Pruning
Authors: Pengcheng Huang, Zhenghao Liu, Yukun Yan, Xiaoyuan Yi, Hao Chen, Zhiyuan Liu, Maosong Sun, Tong Xiao, Ge Yu, Chenyan Xiong
Abstract: Knowledge-Augmented Generation (KAG) has shown great promise in updating the internal memory of Large Language Models (LLMs) by
integrating external knowledge. However, KAG inevitably faces knowledge conflicts when the internal memory contradicts external information. Current approaches to mitigating these conflicts mainly focus on improving external knowledge utilization, but they have shown only limited effectiveness, as internal knowledge continues to influence the generation process of LLMs. In this paper, we propose a ParametrIc Pruning-based Knowledge-Augmented Generation (PIP-KAG) approach, which prunes the internal knowledge of LLMs and incorporates a plug-and-play adaptation module to help LLMs better leverage external sources. Additionally, we construct the CoConflictQA benchmark, built from cases where LLMs hallucinate, to better evaluate contextual faithfulness when answering questions. Experimental results on CoConflictQA demonstrate that PIP-KAG significantly reduces knowledge conflicts and improves context fidelity. Notably, PIP-KAG reduces the LLM's parameters by 13%, enhancing parameter efficiency within the KAG framework. All code is available at https://github.com/OpenBMB/PIP-KAG.
Submitted 21 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 7 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15226">arXiv:2502.15226</a> <span> [<a href="https://arxiv.org/pdf/2502.15226">pdf</a>, <a href="https://arxiv.org/format/2502.15226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Understand User Opinions of Large Language Models via LLM-Powered In-the-Moment User Experience Interviews </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengqiao Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tevin Wang</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+C+A">Cassandra A. Cohen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sarah Li</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Chenyan Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15226v1-abstract-short" style="display: inline;"> Which large language model (LLM) is better? Every evaluation tells a story, but what do users really think about current LLMs? This paper presents CLUE, an LLM-powered interviewer that conducts in-the-moment user experience interviews, right after users interacted with LLMs, and automatically gathers insights about user opinions from massive interview logs. We conduct a study with thousands of use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15226v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15226v1-abstract-full" style="display: none;"> Which large language model (LLM) is better? Every evaluation tells a story, but what do users really think about current LLMs? This paper presents CLUE, an LLM-powered interviewer that conducts in-the-moment user experience interviews, right after users interacted with LLMs, and automatically gathers insights about user opinions from massive interview logs. We conduct a study with thousands of users to understand user opinions on mainstream LLMs, recruiting users to first chat with a target LLM and then interviewed by CLUE. Our experiments demonstrate that CLUE captures interesting user opinions, for example, the bipolar views on the displayed reasoning process of DeepSeek-R1 and demands for information freshness and multi-modality. Our collected chat-and-interview logs will be released. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15226v1-abstract-full').style.display = 'none'; document.getElementById('2502.15226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14709">arXiv:2502.14709</a> <span> [<a href="https://arxiv.org/pdf/2502.14709">pdf</a>, <a href="https://arxiv.org/format/2502.14709">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data-Efficient Pretraining with Group-Level Data Influence Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zichun Yu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+F">Fei Peng</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+J">Jie Lei</a>, <a href="/search/cs?searchtype=author&query=Overwijk%2C+A">Arnold Overwijk</a>, <a href="/search/cs?searchtype=author&query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Chenyan Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14709v1-abstract-short" style="display: inline;"> Data-efficient pretraining has shown tremendous potential to elevate scaling laws. This paper argues that effective pretraining data should be curated at the group level, treating a set of data points as a whole rather than as independent contributors. To achieve that, we propose Group-Level Data Influence Modeling (Group-MATES), a novel data-efficient pretraining method that captures and optimize… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14709v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14709v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14709v1-abstract-full" style="display: none;"> Data-efficient pretraining has shown tremendous potential to elevate scaling laws. This paper argues that effective pretraining data should be curated at the group level, treating a set of data points as a whole rather than as independent contributors. To achieve that, we propose Group-Level Data Influence Modeling (Group-MATES), a novel data-efficient pretraining method that captures and optimizes group-level data utility. Specifically, Group-MATES collects oracle group-level influences by locally probing the pretraining model with data sets. It then fine-tunes a relational data influence model to approximate oracles as relationship-weighted aggregations of individual influences. The fine-tuned model selects the data subset by maximizing its group-level influence prediction, with influence-aware clustering to enable efficient inference. 

3. arXiv:2502.14709 [pdf, other]  cs.CL cs.LG
Data-Efficient Pretraining with Group-Level Data Influence Modeling
Authors: Zichun Yu, Fei Peng, Jie Lei, Arnold Overwijk, Wen-tau Yih, Chenyan Xiong
Abstract: Data-efficient pretraining has shown tremendous potential to elevate scaling laws. This paper argues that effective pretraining data should be curated at the group level, treating a set of data points as a whole rather than as independent contributors. To achieve that, we propose Group-Level Data Influence Modeling (Group-MATES), a novel data-efficient pretraining method that captures and optimizes group-level data utility. Specifically, Group-MATES collects oracle group-level influences by locally probing the pretraining model with data sets. It then fine-tunes a relational data influence model to approximate the oracles as relationship-weighted aggregations of individual influences. The fine-tuned model selects the data subset by maximizing its group-level influence prediction, with influence-aware clustering to enable efficient inference. Experiments on the DCLM benchmark demonstrate that Group-MATES achieves a 10% relative core score improvement on 22 downstream tasks over DCLM-Baseline and 5% over individual-influence-based methods, establishing a new state-of-the-art. Further analyses highlight the effectiveness of relational data influence models in capturing intricate interactions between data points.
Submitted 20 February, 2025; originally announced February 2025.
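A rough way to picture "relationship-weighted aggregations of individual influences" is the sketch below; the data structures (`indiv`, `rel`) and the greedy selection loop are our simplification, not the paper's algorithm.

```python
# Toy model of group-level influence: individual influences plus pairwise
# relationship weights (both assumed given), with greedy subset selection.

def group_influence(group, indiv, rel):
    """indiv[i]: influence of point i alone; rel[(i, j)]: pairwise weight."""
    total = sum(indiv[i] for i in group)
    total += sum(rel.get((i, j), 0.0)
                 for i in group for j in group if i < j)
    return total

def greedy_select(points, indiv, rel, budget):
    chosen = []
    for _ in range(min(budget, len(points))):
        rest = [p for p in points if p not in chosen]
        chosen.append(max(rest,
                          key=lambda p: group_influence(chosen + [p], indiv, rel)))
    return chosen
```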

4. arXiv:2502.14619 [pdf, other]  cs.LG cs.AI cs.CL
Reward Models Identify Consistency, Not Causality
Authors: Yuhui Xu, Hanze Dong, Lei Wang, Caiming Xiong, Junnan Li
Abstract: Reward models (RMs) play a crucial role in aligning large language models (LLMs) with human preferences and enhancing reasoning quality. Traditionally, RMs are trained to rank candidate outputs based on their correctness and coherence. However, in this work, we present several surprising findings that challenge common assumptions about RM behavior. Our analysis reveals that state-of-the-art reward models prioritize structural consistency over causal correctness. Specifically, removing the problem statement has minimal impact on reward scores, whereas altering numerical values or disrupting the reasoning flow significantly affects RM outputs. Furthermore, RMs exhibit a strong dependence on complete reasoning trajectories: truncated or incomplete steps lead to significant variations in reward assignments, indicating that RMs primarily rely on learned reasoning patterns rather than explicit problem comprehension. These findings hold across multiple architectures, datasets, and tasks, leading to three key insights: (1) RMs primarily assess coherence rather than true reasoning quality; (2) the role of explicit problem comprehension in reward assignment is overstated; (3) current RMs may be more effective at ranking responses than verifying logical validity. Our results suggest a fundamental limitation in existing reward modeling approaches, emphasizing the need for a shift toward causality-aware reward models that go beyond consistency-driven evaluation.
Submitted 20 February, 2025; originally announced February 2025.
Comments: 16 pages
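The perturbations described above (dropping the problem, altering numbers, breaking the flow) are straightforward to express as a probe harness. A minimal sketch, where `reward_model` is assumed to be any callable scoring a (problem, steps) pair:

```python
# Sketch of a consistency-vs-causality probe; reward_model is assumed to
# map (problem, list_of_steps) -> float, as in the study described above.
import random
import re

def probe(reward_model, problem, steps, seed=0):
    base = reward_model(problem, steps)
    drop_problem = reward_model("", steps)                 # remove the statement
    alter_numbers = reward_model(
        problem, [re.sub(r"\d+", "7", s) for s in steps])  # corrupt the math
    shuffled = steps[:]
    random.Random(seed).shuffle(shuffled)                  # disrupt the flow
    return {
        "drop_problem": drop_problem - base,               # small delta, per the paper
        "alter_numbers": alter_numbers - base,             # large delta
        "shuffle_steps": reward_model(problem, shuffled) - base,  # large delta
    }
```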
href="/search/cs?searchtype=author&query=Kuen-Yew%2C+B+H">Bryan Hooi Kuen-Yew</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Stengel-Eskin%2C+E">Elias Stengel-Eskin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hongzhi Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">Huaxiu Yao</a> , et al. (41 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14296v1-abstract-short" style="display: inline;"> Generative Foundation Models (GenFMs) have emerged as transformative tools. However, their widespread adoption raises critical concerns regarding trustworthiness across dimensions. This paper presents a comprehensive framework to address these challenges through three key contributions. First, we systematically review global AI governance laws and policies from governments and regulatory bodies, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14296v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14296v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14296v1-abstract-full" style="display: none;"> Generative Foundation Models (GenFMs) have emerged as transformative tools. However, their widespread adoption raises critical concerns regarding trustworthiness across dimensions. This paper presents a comprehensive framework to address these challenges through three key contributions. First, we systematically review global AI governance laws and policies from governments and regulatory bodies, as well as industry practices and standards. Based on this analysis, we propose a set of guiding principles for GenFMs, developed through extensive multidisciplinary collaboration that integrates technical, ethical, legal, and societal perspectives. Second, we introduce TrustGen, the first dynamic benchmarking platform designed to evaluate trustworthiness across multiple dimensions and model types, including text-to-image, large language, and vision-language models. TrustGen leverages modular components--metadata curation, test case generation, and contextual variation--to enable adaptive and iterative assessments, overcoming the limitations of static evaluation methods. Using TrustGen, we reveal significant progress in trustworthiness while identifying persistent challenges. Finally, we provide an in-depth discussion of the challenges and future directions for trustworthy GenFMs, which reveals the complex, evolving nature of trustworthiness, highlighting the nuanced trade-offs between utility and trustworthiness, and consideration for various downstream applications, identifying persistent challenges and providing a strategic roadmap for future research. This work establishes a holistic framework for advancing trustworthiness in GenAI, paving the way for safer and more responsible integration of GenFMs into critical applications. To facilitate advancement in the community, we release the toolkit for dynamic evaluation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14296v1-abstract-full').style.display = 'none'; document.getElementById('2502.14296v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13347">arXiv:2502.13347</a> <span> [<a href="https://arxiv.org/pdf/2502.13347">pdf</a>, <a href="https://arxiv.org/format/2502.13347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Craw4LLM: Efficient Web Crawling for LLM Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shi Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Chenyan Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13347v1-abstract-short" style="display: inline;"> Web crawl is a main source of large language models' (LLMs) pretraining data, but the majority of crawled web pages are discarded in pretraining due to low data quality. This paper presents Crawl4LLM, an efficient web crawling method that explores the web graph based on the preference of LLM pretraining. Specifically, it leverages the influence of a webpage in LLM pretraining as the priority score… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13347v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13347v1-abstract-full" style="display: none;"> Web crawl is a main source of large language models' (LLMs) pretraining data, but the majority of crawled web pages are discarded in pretraining due to low data quality. This paper presents Crawl4LLM, an efficient web crawling method that explores the web graph based on the preference of LLM pretraining. Specifically, it leverages the influence of a webpage in LLM pretraining as the priority score of the web crawler's scheduler, replacing the standard graph connectivity based priority. Our experiments on a web graph containing 900 million webpages from a commercial search engine's index demonstrate the efficiency of Crawl4LLM in obtaining high-quality pretraining data. With just 21% URLs crawled, LLMs pretrained on Crawl4LLM data reach the same downstream performances of previous crawls, significantly reducing the crawling waste and alleviating the burdens on websites. Our code is publicly available at https://github.com/cxcscmu/Crawl4LLM. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13347v1-abstract-full').style.display = 'none'; document.getElementById('2502.13347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11492">arXiv:2502.11492</a> <span> [<a href="https://arxiv.org/pdf/2502.11492">pdf</a>, <a href="https://arxiv.org/format/2502.11492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kung-Hsiang Huang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+C">Can Qin</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+H">Haoyi Qiu</a>, <a href="/search/cs?searchtype=author&query=Laban%2C+P">Philippe Laban</a>, <a href="/search/cs?searchtype=author&query=Joty%2C+S">Shafiq Joty</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chien-Sheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11492v1-abstract-short" style="display: inline;"> Vision Language Models (VLMs) have achieved remarkable progress in multimodal tasks, yet they often struggle with visual arithmetic, seemingly simple capabilities like object counting or length comparison, which are essential for relevant complex tasks like chart understanding and geometric reasoning. In this work, we first investigate the root causes of this deficiency through a suite of probing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11492v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11492v1-abstract-full" style="display: none;"> Vision Language Models (VLMs) have achieved remarkable progress in multimodal tasks, yet they often struggle with visual arithmetic, seemingly simple capabilities like object counting or length comparison, which are essential for relevant complex tasks like chart understanding and geometric reasoning. In this work, we first investigate the root causes of this deficiency through a suite of probing tasks focusing on basic visual arithmetic. Our analysis reveals that while pre-trained vision encoders typically capture sufficient information, the text decoder often fails to decode it correctly for arithmetic reasoning. 

7. arXiv:2502.11492 [pdf, other]  cs.AI cs.CL cs.CV
Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding
Authors: Kung-Hsiang Huang, Can Qin, Haoyi Qiu, Philippe Laban, Shafiq Joty, Caiming Xiong, Chien-Sheng Wu
Abstract: Vision Language Models (VLMs) have achieved remarkable progress in multimodal tasks, yet they often struggle with visual arithmetic: seemingly simple capabilities like object counting or length comparison, which are essential for complex tasks such as chart understanding and geometric reasoning. In this work, we first investigate the root causes of this deficiency through a suite of probing tasks focusing on basic visual arithmetic. Our analysis reveals that while pre-trained vision encoders typically capture sufficient information, the text decoder often fails to decode it correctly for arithmetic reasoning. To address this, we propose CogAlign, a novel post-training strategy inspired by Piaget's theory of cognitive development. CogAlign trains VLMs to recognize invariant properties under visual transformations. We demonstrate that this approach significantly improves the performance of three diverse VLMs on our proposed probing tasks. Furthermore, CogAlign enhances performance by an average of 4.6% on CHOCOLATE and 2.9% on MATH-VISION, outperforming or matching supervised fine-tuning methods while requiring only 60% less training data. These results highlight the effectiveness and generalizability of CogAlign in improving fundamental visual arithmetic capabilities and their transfer to downstream tasks.
Submitted 17 February, 2025; originally announced February 2025.
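One way to read "recognize invariant properties under visual transformations" is as constructing supervision pairs whose label says whether a queried property survives a transformation. The sketch below is our guess at that data construction; the transformation list is illustrative, not from the paper.

```python
# Hypothetical invariance-pair construction in the spirit of CogAlign.
# For object counting: some edits preserve the count, others change it.
TRANSFORMS = {
    "translate_objects": True,   # count is invariant under translation
    "recolor_objects": True,     # ... and under recoloring
    "duplicate_object": False,   # count changes when an object is duplicated
}

def invariance_pairs(image, apply_transform):
    """Yield (original, transformed, is_invariant) training triples."""
    for name, is_invariant in TRANSFORMS.items():
        yield image, apply_transform(image, name), is_invariant
```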

8. arXiv:2502.08806 [pdf, other]  cs.SE cs.AI cs.LG
CLOVER: A Test Case Generation Benchmark with Coverage, Long-Context, and Verification
Authors: Jiacheng Xu, Bo Pang, Jin Qu, Hiroaki Hayashi, Caiming Xiong, Yingbo Zhou
Abstract: Software testing is a critical aspect of software development, yet generating test cases remains a routine task for engineers. This paper presents a benchmark, CLOVER, to evaluate models' capabilities in generating and completing test cases under specific conditions. Spanning from simple assertion completions to writing test cases that cover specific code blocks across multiple files, these tasks are based on 12 Python repositories, analyzing 845 problems with context lengths ranging from 4k to 128k tokens. Utilizing code testing frameworks, we propose a method to construct retrieval contexts using coverage information. While models exhibit comparable performance with short contexts, notable differences emerge with 16k contexts. Notably, models like GPT-4o and Claude 3.5 can effectively leverage relevant snippets; however, all models score below 35% on the complex Task III, even with the oracle context provided, underscoring the benchmark's significance and the potential for model improvement. The benchmark is containerized for code execution across tasks, and we will release the code, data, and construction methodologies.
Submitted 12 February, 2025; originally announced February 2025.
Comments: 16 pages
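The phrase "construct retrieval contexts using coverage information" suggests packing exactly the source spans a target test exercises into the prompt. A sketch under that assumption (the data shapes and names are ours):

```python
# Coverage-guided context packing sketch (our reading of the idea).
# coverage[test_id] -> list of (path, start_line, end_line) spans the test hits.

def build_context(coverage, files, test_id, token_budget=16000):
    parts, used = [], 0
    for path, start, end in coverage[test_id]:
        snippet = "\n".join(files[path][start:end])
        cost = len(snippet.split())               # crude token estimate
        if used + cost > token_budget:
            break
        parts.append(f"# {path}:{start}-{end}\n{snippet}")
        used += cost
    return "\n\n".join(parts)
```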

9. arXiv:2502.06812 [pdf, other]  cs.LG cs.GR
Harness Local Rewards for Global Benefits: Effective Text-to-Video Generation Alignment with Patch-level Reward Models
Authors: Shuting Wang, Haihong Tang, Zhicheng Dou, Chenyan Xiong
Abstract: The emergence of diffusion models (DMs) has significantly improved the quality of text-to-video generation models (VGMs). However, current VGM optimization primarily emphasizes the global quality of videos, overlooking localized errors, which leads to suboptimal generation capabilities. To address this issue, we propose HALO, a post-training strategy for VGMs that explicitly incorporates local feedback from a patch reward model, providing detailed and comprehensive training signals alongside the video reward model for advanced VGM optimization. To develop an effective patch reward model, we distill GPT-4o to continuously train our video reward model, which enhances training efficiency and ensures consistency between video and patch reward distributions. Furthermore, to harmoniously integrate patch rewards into VGM optimization, we introduce a granular DPO (Gran-DPO) algorithm for DMs, allowing collaborative use of both patch and video rewards during the optimization process. Experimental results indicate that our patch reward model aligns well with human annotations and that HALO substantially outperforms the baselines across two evaluation methods. Further experiments quantitatively confirm the existence of patch defects and show that our method effectively alleviates them.
Submitted 17 February, 2025; v1 submitted 4 February, 2025; originally announced February 2025.
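How patch and video rewards might be used "collaboratively" can be pictured as a blended preference signal; the sketch below, with its mixing weight `alpha`, is our simplification of the idea, not the actual Gran-DPO objective.

```python
# Toy blend of global (video) and local (patch) rewards into one preference.
def blended_reward(video_reward, patch_rewards, alpha=0.5):
    local = sum(patch_rewards) / len(patch_rewards)   # mean patch reward
    return (1 - alpha) * video_reward + alpha * local

def preferred(sample_a, sample_b):
    """Pick the winner of a DPO-style preference pair."""
    ra = blended_reward(sample_a["video_r"], sample_a["patch_r"])
    rb = blended_reward(sample_b["video_r"], sample_b["patch_r"])
    return sample_a if ra >= rb else sample_b
```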

10. arXiv:2502.03860 [pdf, other]  cs.CL
BOLT: Bootstrap Long Chain-of-Thought in Language Models without Distillation
Authors: Bo Pang, Hanze Dong, Jiacheng Xu, Silvio Savarese, Yingbo Zhou, Caiming Xiong
Abstract: Large language models (LLMs), such as o1 from OpenAI, have demonstrated remarkable reasoning capabilities. o1 generates a long chain-of-thought (LongCoT) before answering a question. LongCoT allows LLMs to analyze problems, devise plans, reflect, and backtrack effectively, empowering them to solve complex problems. After the release of o1, many teams have attempted to replicate its LongCoT and reasoning capabilities. In terms of methods, they primarily rely on knowledge distillation with data from existing models with LongCoT capacities (e.g., OpenAI-o1, Qwen-QwQ, DeepSeek-R1-Preview), leaving significant uncertainty about how to systematically develop such reasoning abilities. In terms of data domains, these works focus narrowly on math, while a few others include coding, limiting their generalizability. This paper introduces a novel approach to enable LLMs' LongCoT capacity without distillation from o1-like models or expensive human annotations: we bootstrap LongCoT (BOLT) from a standard instruct model. BOLT involves three stages: 1) LongCoT data bootstrapping with in-context learning on a standard instruct model; 2) LongCoT supervised finetuning; 3) online training to further refine LongCoT capacities. In BOLT, only a few in-context examples need to be constructed during the bootstrapping stage; in our experiments, we created 10 examples, demonstrating the feasibility of this approach.
We use Llama-3.1-70B-Instruct to bootstrap LongCoT and apply our method at various model scales (7B, 8B, 70B), achieving impressive performance on benchmarks that evaluate diverse task-solving and reasoning capabilities: Arena-Hard, MT-Bench, WildBench, ZebraLogic, and MATH500.
Submitted 6 February, 2025; originally announced February 2025.
Comments: 36 pages
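The three stages compose into a simple pipeline. Here is a skeleton with the stage implementations abstracted as callables; the signatures are ours, and only the stage order comes from the abstract.

```python
# BOLT pipeline skeleton; generate_longcot/finetune/online_train are
# stand-ins for the real components described in the abstract.
def bolt(generate_longcot, finetune, online_train, icl_examples, prompts):
    # Stage 1: bootstrap LongCoT traces via in-context learning
    traces = [generate_longcot(icl_examples, p) for p in prompts]
    # Stage 2: LongCoT supervised finetuning on the bootstrapped traces
    model = finetune(traces)
    # Stage 3: online training to further refine LongCoT behavior
    return online_train(model, prompts)
```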

11. arXiv:2502.00955 [pdf, other]  cs.CL
Efficient Multi-Agent System Training with Data Influence-Oriented Tree Search
Authors: Wentao Shi, Zichun Yu, Fuli Feng, Xiangnan He, Chenyan Xiong
Abstract: Monte Carlo Tree Search (MCTS) based methods provide promising approaches for generating synthetic data to enhance the self-training of Large Language Model (LLM) based multi-agent systems (MAS). These methods leverage Q-values to estimate individual agent contributions. However, relying solely on Q-values to identify informative data may misalign with the data synthesis objective, as the focus should be on selecting data that best enhances model training. To address this discrepancy, we propose Data Influence-oriented Tree Search (DITS), a novel framework that incorporates influence scores to guide both tree search and data selection. By leveraging influence scores, we effectively identify the most impactful data for system improvement, thereby enhancing model performance. Furthermore, we derive influence-score estimation methods tailored for non-differentiable metrics, significantly reducing computational overhead by utilizing inference computations. Extensive experiments on eight multi-agent datasets demonstrate the robustness and effectiveness of the proposed methods. Notably, our findings reveal that allocating more inference resources to estimating influence scores, rather than Q-values, during data synthesis can more effectively and efficiently enhance model training.
Submitted 2 February, 2025; originally announced February 2025.
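The core substitution (score tree nodes and synthesized trajectories by estimated training influence rather than by Q-value) fits in a few lines; a sketch with assumed node and trajectory structures:

```python
# Influence-guided selection sketch; `influence` is an assumed callable
# estimating how much training on a trajectory would improve the system.
def select_node_to_expand(nodes, influence):
    """Q-value-based MCTS would rank by n['q'] here instead."""
    return max(nodes, key=lambda n: influence(n["trajectory"]))

def select_training_data(trajectories, influence, k):
    return sorted(trajectories, key=influence, reverse=True)[:k]
```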
arXiv:2502.00198 [pdf, other] cs.GT cs.CL
Fairshare Data Pricing for Large Language Models
Authors: Luyang Zhang, Cathy Jiao, Beibei Li, Chenyan Xiong
Abstract: Training data is a pivotal resource for building large language models (LLMs), but unfair pricing in data markets poses a serious challenge for both data buyers (e.g., LLM builders) and sellers (e.g., human annotators): it discourages market participation, reducing data quantity and quality. In this paper, we propose a fairshare pricing framework that sets training data prices using data valuation methods to quantify their contribution to LLMs. In our framework, buyers make purchasing decisions using data valuation, and sellers set prices to maximize their profits based on the anticipated buyer purchases. We theoretically show that pricing derived from our framework is tightly linked to data valuation and buyers' budgets, and is optimal for both buyers and sellers. Through market simulations using current LLMs and datasets (math problems, medical diagnosis, and physical reasoning), we show that our framework is fairshare for buyers, ensuring their purchased data is reflective of model-training value and leading to higher LLM task performance per dollar spent on data, and fairshare for sellers, ensuring they sell their data at optimal prices. Our framework lays the foundation for future research on equitable and sustainable data markets for large-scale AI.
Submitted 31 January, 2025; originally announced February 2025.
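As a toy reading of the buyer/seller interaction described above, one can simulate a budgeted buyer who ranks items by valuation per dollar and a seller who prices against that rule. This is a hedged simplification for intuition only, not the paper's formal framework:

```python
def buyer_purchases(valuations, prices, budget):
    """Greedy buyer: rank items by value-per-dollar and buy within budget."""
    order = sorted(range(len(prices)),
                   key=lambda i: valuations[i] / prices[i], reverse=True)
    chosen, spent = [], 0.0
    for i in order:
        if spent + prices[i] <= budget:
            chosen.append(i)
            spent += prices[i]
    return chosen

def seller_best_price(valuation, candidate_prices, budget, others):
    """Pick the price maximizing revenue for one item, anticipating the
    buyer rule above (other items' valuations and prices held fixed)."""
    best_price, best_revenue = None, -1.0
    for p in candidate_prices:
        vals = [valuation] + [v for v, _ in others]
        prices = [p] + [q for _, q in others]
        revenue = p if 0 in buyer_purchases(vals, prices, budget) else 0.0
        if revenue > best_revenue:
            best_price, best_revenue = p, revenue
    return best_price

others = [(0.5, 1.0), (0.8, 2.0)]           # (valuation, price) of competing items
print(seller_best_price(0.9, [0.5, 1.0, 2.0, 4.0], budget=3.0, others=others))  # 2.0
```

The seller's optimum tracks the item's valuation and the buyer's budget, which is the qualitative link the abstract's theory statement describes.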
arXiv:2501.19324 [pdf, other] cs.CL cs.AI
Reward-Guided Speculative Decoding for Efficient LLM Reasoning
Authors: Baohao Liao, Yuhui Xu, Hanze Dong, Junnan Li, Christof Monz, Silvio Savarese, Doyen Sahoo, Caiming Xiong
Abstract: We introduce Reward-Guided Speculative Decoding (RSD), a novel framework aimed at improving the efficiency of inference in large language models (LLMs). RSD synergistically combines a lightweight draft model with a more powerful target model, incorporating a controlled bias to prioritize high-reward outputs, in contrast to existing speculative decoding methods that enforce strict unbiasedness. RSD employs a process reward model to evaluate intermediate decoding steps and dynamically decide whether to invoke the target model, optimizing the trade-off between computational cost and output quality. We theoretically demonstrate that a threshold-based mixture strategy achieves an optimal balance between resource utilization and performance. Extensive evaluations on challenging reasoning benchmarks, including Olympiad-level tasks, show that RSD delivers significant efficiency gains over decoding with the target model alone (up to 4.4x fewer FLOPs), while achieving significantly better accuracy than parallel decoding methods on average (up to +3.5). These results highlight RSD as a robust and cost-effective approach for deploying LLMs in resource-intensive scenarios. The code is available at https://github.com/BaohaoLiao/RSD.
Submitted 14 February, 2025; v1 submitted 31 January, 2025; originally announced January 2025.
Comments: 17 pages
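The threshold-based mixture RSD describes reduces to a small accept/reject loop. The sketch below assumes stub callables for the draft model, target model, and process reward model; all names, the `<eos>` convention, and the threshold value are hypothetical:

```python
def rsd_generate(prompt, draft_step, target_step, reward, threshold, max_steps=32):
    """Threshold-based mixture: keep a cheap draft step when its process
    reward clears the threshold, otherwise fall back to the target model."""
    steps = []
    for _ in range(max_steps):
        cand = draft_step(prompt, steps)              # cheap proposal
        if reward(prompt, steps, cand) >= threshold:
            steps.append(cand)                        # biased toward high reward
        else:
            steps.append(target_step(prompt, steps))  # expensive fallback
        if steps[-1].endswith("<eos>"):
            break
    return steps

# toy stubs purely for illustration
draft  = lambda p, s: f"draft-step-{len(s)}"
target = lambda p, s: f"target-step-{len(s)}<eos>" if len(s) >= 2 else f"target-step-{len(s)}"
score  = lambda p, s, c: 0.9 if len(s) % 2 == 0 else 0.1
print(rsd_generate("2+2?", draft, target, score, threshold=0.5))
```

Unlike classic speculative decoding, nothing here verifies that accepted draft steps match the target distribution; the controlled bias toward high-reward steps is the point.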
arXiv:2501.05793 [pdf, other] cs.CR
ActMiner: Applying Causality Tracking and Increment Aligning for Graph-based Cyber Threat Hunting
Authors: Mingjun Ma, Tiantian Zhu, Tieming Chen, Shuang Li, Jie Ying, Chunlin Xiong, Mingqi Lv, Yan Chen
Abstract: To defend against Advanced Persistent Threats on the endpoint, threat hunting employs security knowledge such as cyber threat intelligence to continuously analyze system audit logs through retrospective scanning, querying, or pattern matching, aiming to uncover attack patterns/graphs that traditional detection methods (e.g., recognition of Points of Interest) fail to capture. However, existing threat hunting systems based on provenance graphs face challenges of high false negatives, high false positives, and low efficiency when confronted with diverse attack tactics and voluminous audit logs. To address these issues, we propose a system called ActMiner, which constructs query graphs from descriptive relationships in cyber threat intelligence reports for precise threat hunting (i.e., graph alignment) on provenance graphs. First, we present a heuristic search strategy based on equivalent semantic transfer to reduce false negatives. Second, we establish a filtering mechanism based on causal relationships of attack behaviors to mitigate false positives. Finally, we design a tree structure to incrementally update the alignment results, significantly improving hunting efficiency. Evaluation on the DARPA Engagement dataset demonstrates that, compared to the SOTA POIROT, ActMiner reduces false positives by 39.1%, eliminates all false negatives, and effectively counters adversarial attacks.
Submitted 10 January, 2025; originally announced January 2025.
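A deliberately small illustration of type-level query-graph alignment, with an equivalence table standing in for ActMiner's "equivalent semantic transfer" heuristic; the edge encoding and the table contents are invented for the example:

```python
# Equivalence classes let semantically equivalent audit events match
# (a crude stand-in for equivalent semantic transfer).
EQUIV = {"read": {"read", "recv"}, "write": {"write", "send"}, "exec": {"exec"}}

def align(query_edges, prov_edges):
    """Map each query-graph edge to the provenance edges it matches.
    Edges are (src_type, operation, dst_type) triples."""
    matches = {}
    for (qs, qop, qd) in query_edges:
        ok = EQUIV.get(qop, {qop})
        matches[(qs, qop, qd)] = [e for e in prov_edges
                                  if e[0] == qs and e[1] in ok and e[2] == qd]
    return matches

query = [("process", "read", "file"), ("process", "write", "socket")]
prov  = [("process", "read", "file"), ("process", "send", "socket"),
         ("process", "exec", "process")]
print(align(query, prov))  # "send" matches the "write" query edge via EQUIV
```

The real system additionally filters matches by causal ordering and updates alignments incrementally; this sketch covers only the matching step.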
arXiv:2501.04961 [pdf, other] cs.CL cs.AI cs.CE cs.LG
Demystifying Domain-adaptive Post-training for Financial LLMs
Authors: Zixuan Ke, Yifei Ming, Xuan-Phi Nguyen, Caiming Xiong, Shafiq Joty
Abstract: Domain-adaptive post-training of large language models (LLMs) has emerged as a promising approach for specialized domains such as medicine and finance. However, significant challenges remain in identifying optimal adaptation criteria and training strategies across varying data and model configurations. To address these challenges, we introduce FINDAP, a systematic and fine-grained investigation into domain-adaptive post-training of LLMs for the finance domain. Our approach consists of four key components: FinCap, which defines the core capabilities required for the target domain; FinRec, an effective training recipe that jointly optimizes continual pre-training and instruction-following, along with a novel preference data distillation method leveraging process signals from a generative reward model; FinTrain, a curated set of training datasets supporting FinRec; and FinEval, a comprehensive evaluation suite aligned with FinCap. The resulting model, Llama-Fin, achieves state-of-the-art performance across a wide range of financial tasks. Our analysis also highlights how each post-training stage contributes to distinct capabilities, uncovering specific challenges and effective solutions, providing valuable insights for domain adaptation of LLMs.
Submitted 11 February, 2025; v1 submitted 8 January, 2025; originally announced January 2025.

arXiv:2501.01146 [pdf, ps, other] cs.CR
PoVF: Empowering Decentralized Blockchain Systems with Verifiable Function Consensus
Authors: Chenxi Xiong, Ting Yang, Yu Wang, Bing Dong
Abstract: The consensus mechanism is the core technology by which a blockchain ensures that transactions are executed in sequence, and it determines the decentralization, security, and efficiency of the blockchain. Existing mechanisms all have certain centralization issues and fail to ensure the decentralization of blockchain networks, so a decentralized and efficient mechanism is needed to improve blockchain systems. This paper proposes a fair consensus mechanism called Proof of Verifiable Functions (PoVF), based on the verifiability and unpredictability of verifiable functions. PoVF provides a sufficiently fair mechanism, ensuring that all nodes in the blockchain network have an equal opportunity to participate in consensus. In addition, a structure called a "delay buffer" is proposed to ensure transactions are executed sequentially: it delays the selection of blocks to avoid blockchain forks caused by broadcasting and transaction-execution confusion. According to our security analysis, PoVF is provably secure and able to resist potential adversaries. According to our experiments, a PoVF-based blockchain can process up to 4,000 transactions per second with nodes configured with only 4-core CPUs. Measuring decentralization with the Gini coefficient, the PoVF-based blockchain achieves the lowest Gini coefficient, 0.39, among all sampled blockchains. PoVF is thus shown to provide sufficient efficiency while ensuring decentralization and security.
Submitted 2 January, 2025; originally announced January 2025.
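A rough sketch of verifiable-function-style leader eligibility. A keyed hash stands in for a real VRF here (a deployment would use a VRF so peers can verify each output against a public key), and the difficulty parameter is illustrative:

```python
import hashlib, os

def vf_output(secret_key: bytes, round_seed: bytes) -> int:
    # Keyed-hash stand-in for a verifiable function: unpredictable before the
    # round seed is known, deterministic afterwards.
    digest = hashlib.sha256(secret_key + round_seed).digest()
    return int.from_bytes(digest, "big")

def eligible(secret_key: bytes, round_seed: bytes, difficulty: int) -> bool:
    # Every node gets a uniform draw from the same distribution, so each has
    # an equal chance of clearing the target: the fairness property PoVF
    # aims at (no stake or hash power advantage).
    target = 2**256 // difficulty
    return vf_output(secret_key, round_seed) < target

keys = [os.urandom(32) for _ in range(5)]
seed = b"round-42"
print([eligible(k, seed, difficulty=2) for k in keys])
```

The delay buffer described above would then hold eligible blocks briefly before final selection, so late-arriving broadcasts cannot fork the chain; that buffering step is not sketched here.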
arXiv:2412.18011 [pdf, other] cs.CL
StructTest: Benchmarking LLMs' Reasoning through Compositional Structured Outputs
Authors: Hailin Chen, Fangkai Jiao, Mathieu Ravaut, Nawshad Farruque, Xuan Phi Nguyen, Chengwei Qin, Manan Dey, Bosheng Ding, Caiming Xiong, Shafiq Joty, Yingbo Zhou
Abstract: The rapid development of large language models (LLMs) necessitates robust, unbiased, and scalable methods for evaluating their capabilities. However, human annotations are expensive to scale, model-based evaluations are prone to biases in answer style, and target-answer-based benchmarks are vulnerable to data contamination and cheating. To address these limitations, we propose StructTest, a novel benchmark that evaluates LLMs on their ability to produce compositionally specified structured outputs as an unbiased, cheap-to-run, and difficult-to-cheat measure. The evaluation is done deterministically by a rule-based evaluator, which can be easily extended to new tasks. By testing structured outputs across diverse task domains, including Summarization, Code, HTML, and Math, we demonstrate that StructTest serves as a good proxy for general reasoning abilities, as producing structured outputs often requires internal logical reasoning. We believe that StructTest offers a critical, complementary approach to objective and robust model evaluation.
Submitted 23 December, 2024; originally announced December 2024.
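A rule-based evaluator of the kind the abstract describes is deterministic and cheap to run. The concrete format rule below (JSON with required keys plus a bullet budget) is an invented example of a compositional instruction, not one of StructTest's actual tasks:

```python
import json

def check_structured_output(text: str, required_keys: tuple[str, ...],
                            max_bullets: int) -> bool:
    """Deterministically check a compositional format instruction: the model
    must emit JSON containing `required_keys`, and the 'summary' field may
    contain at most `max_bullets` bullet lines."""
    try:
        obj = json.loads(text)
    except json.JSONDecodeError:
        return False
    if not all(k in obj for k in required_keys):
        return False
    bullets = [ln for ln in str(obj.get("summary", "")).splitlines()
               if ln.lstrip().startswith("- ")]
    return len(bullets) <= max_bullets

out = '{"title": "t", "summary": "- a\\n- b"}'
print(check_structured_output(out, ("title", "summary"), max_bullets=3))  # True
```

Because the check is pure string/JSON logic, there is no judge model to bias and no reference answer to leak into training data, which is the benchmark's stated advantage.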
arXiv:2412.17847 [pdf, other] cs.AI cs.CL cs.CY cs.LG cs.MM
Bridging the Data Provenance Gap Across Text, Speech and Video
Authors: Shayne Longpre, Nikhil Singh, Manuel Cherep, Kushagra Tiwary, Joanna Materzynska, William Brannon, Robert Mahari, Naana Obeng-Marnu, Manan Dey, Mohammed Hamdy, Nayan Saxena, Ahmad Mustafa Anis, Emad A. Alghamdi, Vu Minh Chien, Da Yin, Kun Qian, Yizhi Li, Minnie Liang, An Dinh, Shrestha Mohanty, Deividas Mataciunas, Tobin South, Jianguo Zhang, Ariel N. Lee, Campbell S. Lund, et al. (18 additional authors not shown)
Abstract: Progress in AI is driven largely by the scale and quality of training data. Despite this, there is a deficit of empirical analysis examining the attributes of well-established datasets beyond text. In this work we conduct the largest and first-of-its-kind longitudinal audit across modalities (popular text, speech, and video datasets) from their detailed sourcing trends and use restrictions to their geographical and linguistic representation. Our manual analysis covers nearly 4000 public datasets between 1990 and 2024, spanning 608 languages, 798 sources, 659 organizations, and 67 countries. We find that multimodal machine learning applications have overwhelmingly turned to web-crawled, synthetic, and social media platforms, such as YouTube, for their training sets, eclipsing all other sources since 2019. Second, tracing the chain of dataset derivations, we find that while less than 33% of datasets are restrictively licensed, over 80% of the source content in widely used text, speech, and video datasets carries non-commercial restrictions. Finally, counter to the rising number of languages and geographies represented in public AI training datasets, our audit demonstrates that measures of relative geographical and multilingual representation have failed to significantly improve their coverage since 2013. We believe the breadth of our audit enables us to empirically examine trends in data sourcing, restrictions, and Western-centricity at an ecosystem level, and that visibility into these questions is essential to progress in responsible AI. As a contribution to ongoing improvements in dataset transparency and responsible use, we release our entire multimodal audit, allowing practitioners to trace data provenance across text, speech, and video.
Submitted 18 February, 2025; v1 submitted 18 December, 2024; originally announced December 2024.
Comments: ICLR 2025. 10 pages, 5 figures (main paper)

arXiv:2412.12300 [pdf, other] cs.CL
Unanswerability Evaluation for Retrieval Augmented Generation
Authors: Xiangyu Peng, Prafulla Kumar Choubey, Caiming Xiong, Chien-Sheng Wu
Abstract: Existing evaluation frameworks for retrieval-augmented generation (RAG) systems focus on answerable queries, but they overlook the importance of appropriately rejecting unanswerable requests. In this paper, we introduce UAEval4RAG, a framework designed to evaluate whether RAG systems can handle unanswerable queries effectively. We define a taxonomy with six unanswerable categories, and UAEval4RAG automatically synthesizes diverse and challenging queries for any given knowledge base, scored with unanswered-ratio and acceptable-ratio metrics. We conduct experiments with various RAG components, including retrieval models, rewriting methods, rerankers, language models, and prompting strategies, and reveal hidden trade-offs in the performance of RAG systems. Our findings highlight the critical role of component selection and prompt design in optimizing RAG systems to balance accuracy on answerable queries with high rejection rates for unanswerable ones. UAEval4RAG provides valuable insights and tools for developing more robust and reliable RAG systems.
Submitted 5 February, 2025; v1 submitted 16 December, 2024; originally announced December 2024.
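Both metrics can be computed from a labeled response log. The exact formulas below are assumed from the metric names, not taken from the paper:

```python
def uaeval_metrics(responses: list[dict]) -> dict:
    """Assumed readings of the two metrics:
    unanswered ratio: share of unanswerable queries the system declined;
    acceptable ratio: share of answerable queries it actually answered.
    Each response: {"unanswerable": bool, "rejected": bool}."""
    unans = [r for r in responses if r["unanswerable"]]
    ans = [r for r in responses if not r["unanswerable"]]
    return {
        "unanswered_ratio": sum(r["rejected"] for r in unans) / max(len(unans), 1),
        "acceptable_ratio": sum(not r["rejected"] for r in ans) / max(len(ans), 1),
    }

log = [{"unanswerable": True,  "rejected": True},
       {"unanswerable": True,  "rejected": False},
       {"unanswerable": False, "rejected": False}]
print(uaeval_metrics(log))  # {'unanswered_ratio': 0.5, 'acceptable_ratio': 1.0}
```

The trade-off the abstract mentions is visible in this pair: a system that rejects everything maximizes the first metric and zeroes the second.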
arXiv:2412.09722 [pdf, other] cs.CL
GReaTer: Gradients over Reasoning Makes Smaller Language Models Strong Prompt Optimizers
Authors: Sarkar Snigdha Sarathi Das, Ryo Kamoi, Bo Pang, Yusen Zhang, Caiming Xiong, Rui Zhang
Abstract: The effectiveness of large language models (LLMs) is closely tied to the design of prompts, making prompt optimization essential for enhancing their performance across a wide range of tasks. Many existing approaches to automating prompt engineering rely exclusively on textual feedback, refining prompts based solely on inference errors identified by large, computationally expensive LLMs. Unfortunately, smaller models struggle to generate high-quality feedback, resulting in complete dependence on large-LLM judgment. Moreover, these methods fail to leverage more direct and finer-grained information, such as gradients, because they operate purely in text space. To this end, we introduce GReaTer, a novel prompt optimization technique that directly incorporates gradient information over task-specific reasoning.
By utilizing task-loss gradients, GReaTer enables self-optimization of prompts for open-source, lightweight language models without the need for costly closed-source LLMs. This allows high-performance prompt optimization without dependence on massive LLMs, closing the gap between smaller models and the sophisticated reasoning often needed for prompt refinement. Extensive evaluations across diverse reasoning tasks, including BBH, GSM8k, and FOLIO, demonstrate that GReaTer consistently outperforms previous state-of-the-art prompt optimization methods, even those reliant on powerful LLMs. Additionally, GReaTer-optimized prompts frequently exhibit better transferability and, in some cases, boost task performance to levels comparable to or surpassing those achieved by larger language models, highlighting the effectiveness of prompt optimization guided by gradients over reasoning. Code for GReaTer is available at https://github.com/psunlpgroup/GreaTer.
Submitted 12 December, 2024; originally announced December 2024.
Comments: 32 pages, 8 figures
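A toy PyTorch rendering of gradient-guided prompt-token search: score candidate token swaps with a first-order (HotFlip-style) estimate from the task-loss gradient at the prompt embeddings. GReaTer's actual procedure differs in detail, and the model, sizes, and token ids below are placeholders:

```python
import torch
import torch.nn.functional as F

vocab, dim = 100, 16
emb = torch.nn.Embedding(vocab, dim)
head = torch.nn.Linear(dim, vocab)     # toy "model": mean-pool + linear readout

prompt_ids = torch.tensor([3, 17, 42])
answer_id = torch.tensor([7])

x = emb(prompt_ids)                    # (3, dim) prompt embeddings
x.retain_grad()                        # keep grads on this non-leaf tensor
logits = head(x.mean(dim=0, keepdim=True))
loss = F.cross_entropy(logits, answer_id)
loss.backward()

# First-order change in loss if prompt token j were swapped to token v:
# (e_v - e_j) . grad_j ; more negative = predicted improvement.
g = x.grad                                       # (3, dim)
delta = emb.weight @ g.T - (x * g).sum(dim=1)    # (vocab, 3)
best = delta.argmin(dim=0)                       # best replacement per position
print(best.tolist())
```

The point of the contrast with text-feedback methods is visible here: the gradient gives a dense score over the whole vocabulary per position, with no judge LLM in the loop.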
arXiv:2412.09605 [pdf, other] cs.CL
AgentTrek: Agent Trajectory Synthesis via Guiding Replay with Web Tutorials
Authors: Yiheng Xu, Dunjie Lu, Zhennan Shen, Junli Wang, Zekun Wang, Yuchen Mao, Caiming Xiong, Tao Yu
Abstract: Graphical User Interface (GUI) agents hold great potential for automating complex tasks across diverse digital environments, from web applications to desktop software. However, the development of such agents is hindered by the lack of the high-quality, multi-step trajectory data required for effective training. Existing approaches rely on expensive and labor-intensive human annotation, making them unsustainable at scale. To address this challenge, we propose AgentTrek, a scalable data synthesis pipeline that generates high-quality GUI agent trajectories by leveraging web tutorials. Our method automatically gathers tutorial-like texts from the internet, transforms them into task goals with step-by-step instructions, and employs a visual-language model agent to simulate their execution in a real digital environment. A VLM-based evaluator ensures the correctness of the generated trajectories. We demonstrate that training GUI agents with these synthesized trajectories significantly improves their grounding and planning performance over current models. Moreover, our approach is more cost-efficient than traditional human annotation methods. This work underscores the potential of guided replay with web tutorials as a viable strategy for large-scale GUI agent training, paving the way for more capable and autonomous digital agents.
Submitted 12 December, 2024; originally announced December 2024.
Comments: https://agenttrek.github.io
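The pipeline reduces to three stages: parse tutorials into tasks, replay them with an agent, and keep what a judge accepts. The sketch below uses stub callables for the replay agent and the VLM judge; the structure is inferred from the abstract, not from the released code:

```python
from dataclasses import dataclass

@dataclass
class Task:
    goal: str
    steps: list[str]

def tutorial_to_task(tutorial_text: str) -> Task:
    # Stand-in for the LLM pass that turns tutorial prose into a task goal
    # plus step-by-step instructions (here: first line = goal, rest = steps).
    lines = [ln.strip() for ln in tutorial_text.splitlines() if ln.strip()]
    return Task(goal=lines[0], steps=lines[1:])

def synthesize(tutorials, replay_agent, judge):
    """AgentTrek-shaped loop: replay each tutorial-derived task with an
    agent in a real environment, keep trajectories the evaluator accepts."""
    kept = []
    for text in tutorials:
        task = tutorial_to_task(text)
        traj = replay_agent(task)      # stub: execute steps, record actions
        if judge(task, traj):          # stub: VLM-based correctness filter
            kept.append((task, traj))
    return kept
```

The judge is what makes the data usable for training: replay alone produces trajectories, but only the filter turns them into high-quality supervision.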
arXiv:2412.08859 [pdf, other] cs.CV
ViUniT: Visual Unit Tests for More Robust Visual Programming
Authors: Artemis Panagopoulou, Honglu Zhou, Silvio Savarese, Caiming Xiong, Chris Callison-Burch, Mark Yatskar, Juan Carlos Niebles
Abstract: Programming-based approaches to reasoning tasks have substantially expanded the types of questions models can answer about visual scenes. Yet on benchmark visual reasoning data, when models answer correctly, they produce incorrect programs 33% of the time. These models are often right for the wrong reasons and risk unexpected failures on new data. Unit tests play a foundational role in ensuring code correctness and could be used to repair such failures. We propose Visual Unit Testing (ViUniT), a framework to improve the reliability of visual programs by automatically generating unit tests. In our framework, a unit test is represented as a novel image and answer pair meant to verify the logical correctness of a program produced for a given query. Our method leverages a language model to create unit tests in the form of image descriptions and expected answers, and image synthesis to produce corresponding images.
We conduct a comprehensive analysis of what constitutes an effective visual unit test suite, exploring unit test generation, sampling strategies, image generation methods, and varying the number of programs and unit tests. Additionally, we introduce four applications of visual unit tests: best program selection, answer refusal, re-prompting, and unsupervised reward formulations for reinforcement learning. Experiments with two models across three datasets in visual question answering and image-text matching demonstrate that ViUniT improves model performance by 11.4%. Notably, it enables 7B open-source models to outperform gpt-4o-mini by an average of 7.7% and reduces the occurrence of programs that are correct for the wrong reasons by 40%.
Submitted 11 December, 2024; originally announced December 2024.
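Two of the listed applications, best-program selection and answer refusal, follow directly from unit-test pass rates. A hedged sketch with an abstract `execute` callable and an assumed refusal threshold:

```python
def pass_rate(program, unit_tests, execute):
    """Fraction of unit tests a candidate program satisfies; each test is an
    (image, expected_answer) pair and `execute` runs the program on an image."""
    hits = sum(execute(program, img) == ans for img, ans in unit_tests)
    return hits / max(len(unit_tests), 1)

def best_program(programs, unit_tests, execute, refuse_below=0.5):
    """Best-program selection with answer refusal: return the candidate with
    the highest pass rate, or None when even the best candidate fails too
    many tests (the 0.5 threshold is an assumption, not from the paper)."""
    scored = sorted(programs, key=lambda p: pass_rate(p, unit_tests, execute),
                    reverse=True)
    top = scored[0]
    return top if pass_rate(top, unit_tests, execute) >= refuse_below else None
```

A program that answers the original query correctly but fails synthesized tests gets filtered out, which is exactly how the framework targets "right for the wrong reasons" programs.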
arXiv:2412.07012 [pdf, other] cs.CV cs.AI
ProVision: Programmatically Scaling Vision-centric Instruction Data for Multimodal Language Models
Authors: Jieyu Zhang, Le Xue, Linxin Song, Jun Wang, Weikai Huang, Manli Shu, An Yan, Zixian Ma, Juan Carlos Niebles, Silvio Savarese, Caiming Xiong, Zeyuan Chen, Ranjay Krishna, Ran Xu
Abstract: With the rise of multimodal applications, instruction data has become critical for training multimodal language models capable of understanding complex image-based queries. Existing practices rely on powerful but costly large language models (LLMs) or multimodal language models (MLMs) to produce instruction data. These are often prone to hallucinations and licensing issues, and the generation process is often hard to scale and interpret. In this work, we present a programmatic approach that employs scene graphs as symbolic representations of images and human-written programs to systematically synthesize vision-centric instruction data. Our approach ensures the interpretability and controllability of the data generation process and scales efficiently while maintaining factual accuracy.
By implementing a suite of 24 single-image and 14 multi-image instruction generators, plus a scene graph generation pipeline, we build a scalable, cost-effective system, ProVision, which produces diverse question-answer pairs concerning objects, attributes, relations, depth, etc., for any given image. Applied to the Visual Genome and DataComp datasets, we generate over 10 million instruction data points, ProVision-10M, and leverage them in both the pretraining and instruction tuning stages of MLMs. When adopted in the instruction tuning stage, our single-image instruction data yields up to a 7% improvement on the 2D split and 8% on the 3D split of CVBench, along with a 3% increase in performance on QBench2, RealWorldQA, and MMMU. Our multi-image instruction data leads to an 8% improvement on Mantis-Eval. Incorporating our data in both the pre-training and fine-tuning stages of xGen-MM-4B leads to an average improvement of 1.6% across 11 benchmarks.
Submitted 28 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
Comments: code: https://github.com/JieyuZ2/ProVision dataset: https://huggingface.co/datasets/Salesforce/ProVision-10M
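A miniature version of the programmatic idea: hand-written generators walk a scene graph and emit question-answer pairs deterministically. The scene-graph schema and generator wording here are invented for illustration, not drawn from the released generator suite:

```python
# Scene graph as the symbolic image representation (schema assumed).
scene_graph = {
    "objects": {"o1": {"name": "dog", "attributes": ["brown"]},
                "o2": {"name": "ball", "attributes": ["red"]}},
    "relations": [("o1", "chasing", "o2")],
}

def relation_qa(sg):
    # One relation-focused generator: each (src, rel, dst) triple yields a QA pair.
    objs = sg["objects"]
    for src, rel, dst in sg["relations"]:
        yield f"What is the {objs[src]['name']} doing to the {objs[dst]['name']}?", rel

def attribute_qa(sg):
    # One attribute-focused generator (color attributes in this toy graph).
    for obj in sg["objects"].values():
        for attr in obj["attributes"]:
            yield f"What color is the {obj['name']}?", attr

print(list(relation_qa(scene_graph)) + list(attribute_qa(scene_graph)))
```

Because answers are read straight off the graph rather than sampled from a model, the pairs are factual by construction, which is the controllability claim the abstract makes.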
arXiv:2412.06206 [pdf, other] cs.CL cs.AI
SiReRAG: Indexing Similar and Related Information for Multihop Reasoning
Authors: Nan Zhang, Prafulla Kumar Choubey, Alexander Fabbri, Gabriel Bernadett-Shapiro, Rui Zhang, Prasenjit Mitra, Caiming Xiong, Chien-Sheng Wu
Abstract: Indexing is an important step towards strong performance in retrieval-augmented generation (RAG) systems. However, existing methods organize data based on either semantic similarity (similarity) or related information (relatedness), but do not cover both perspectives comprehensively. Our analysis reveals that modeling only one perspective results in insufficient knowledge synthesis, leading to suboptimal performance on complex tasks requiring multihop reasoning.
In this paper, we propose SiReRAG, a novel RAG indexing approach that explicitly considers both similar and related information. On the similarity side, we follow existing work and explore some variants to construct a similarity tree based on recursive summarization. On the relatedness side, SiReRAG extracts propositions and entities from texts, groups propositions via shared entities, and generates recursive summaries to construct a relatedness tree. We index and flatten both similarity and relatedness trees into a unified retrieval pool. Our experiments demonstrate that SiReRAG consistently outperforms state-of-the-art indexing methods on three multihop datasets (MuSiQue, 2WikiMultiHopQA, and HotpotQA), with an average 1.9% improvement in F1 scores. As a reasonably efficient solution, SiReRAG also enhances existing reranking methods significantly, with up to a 7.8% improvement in average F1 scores.
Submitted 8 December, 2024; originally announced December 2024.
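The first step on the relatedness side, grouping propositions via shared entities, is only a few lines; the proposition encoding below is an assumed format:

```python
from collections import defaultdict

def group_by_shared_entities(propositions):
    """Relatedness-side grouping: propositions mentioning the same entity land
    in one group, ready for recursive summarization into a relatedness tree.
    propositions: list of (text, {entities}) pairs (encoding assumed)."""
    groups = defaultdict(list)
    for text, entities in propositions:
        for ent in entities:
            groups[ent].append(text)
    return dict(groups)

props = [
    ("Marie Curie won two Nobel Prizes.", {"Marie Curie", "Nobel Prize"}),
    ("The Nobel Prize is awarded in Stockholm.", {"Nobel Prize", "Stockholm"}),
]
print(group_by_shared_entities(props)["Nobel Prize"])
```

The two example propositions are dissimilar as text yet share an entity, so they end up in one group: exactly the multihop connections a similarity-only index misses.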
arXiv:2412.05479 [pdf, other] cs.CV
TACO: Learning Multi-modal Action Models with Synthetic Chains-of-Thought-and-Action
Authors: Zixian Ma, Jianguo Zhang, Zhiwei Liu, Jieyu Zhang, Juntao Tan, Manli Shu, Juan Carlos Niebles, Shelby Heinecke, Huan Wang, Caiming Xiong, Ranjay Krishna, Silvio Savarese
Abstract: While open-source multi-modal language models perform well on simple question answering tasks, they often fail on complex questions that require multiple capabilities, such as fine-grained recognition, visual grounding, and reasoning, and that demand multi-step solutions. We present TACO, a family of multi-modal large action models designed to improve performance on such complex, multi-step, and multi-modal tasks. During inference, TACO produces chains-of-thought-and-action (CoTA), executes intermediate steps by invoking external tools such as OCR, depth estimation, and a calculator, then integrates both the thoughts and action outputs to produce coherent responses. To train TACO, we create a large dataset of over 1M synthetic CoTA traces generated with GPT-4o and Python programs. We then experiment with various data filtering and mixing techniques and obtain a final subset of 293K high-quality CoTA examples.
This dataset enables TACO to learn complex reasoning and action paths, surpassing existing models trained on instruction tuning data with only direct answers. Our model TACO outperforms the instruction-tuned baseline across 8 benchmarks, achieving a 3.6% improvement on average, with gains of up to 15% on MMVet tasks involving OCR, mathematical reasoning, and spatial reasoning. Training on high-quality CoTA traces sets a new standard for complex multi-modal reasoning, highlighting the need for structured, multi-step instruction tuning in advancing open-source multi-modal models' capabilities.
Submitted 10 December, 2024; v1 submitted 6 December, 2024; originally announced December 2024.
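
The inference behavior described here (interleaved thoughts and tool calls folded into a final answer) is easy to picture as a loop. A minimal sketch follows, assuming a hypothetical `model` callable that emits either a tool call or a final answer; the step schema and tool names are illustrative, not TACO's actual format.

```python
def cota_answer(model, tools, question, image, max_steps=8):
    """Run a chain-of-thought-and-action loop: ask the model for the
    next step, execute any requested tool, append the observation to
    the trace, and stop once the model emits a final answer."""
    trace = [f"Question: {question}"]
    for _ in range(max_steps):
        step = model(trace, image)  # {"thought", "action", "args"} or {"answer"}
        if "answer" in step:
            return step["answer"], trace
        trace.append(f"Thought: {step['thought']}")
        observation = tools[step["action"]](image, **step.get("args", {}))
        trace.append(f"Action: {step['action']} -> {observation}")
    return None, trace  # give up after max_steps

# Toy tool registry; real tools would be OCR, depth estimation, etc.
tools = {"ocr": lambda img: "text read from the image",
         "calculator": lambda img, expr="0": eval(expr)}
```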

arXiv:2412.04454 [pdf, other] cs.CL
Aguvis: Unified Pure Vision Agents for Autonomous GUI Interaction
Authors: Yiheng Xu, Zekun Wang, Junli Wang, Dunjie Lu, Tianbao Xie, Amrita Saha, Doyen Sahoo, Tao Yu, Caiming Xiong
Abstract: Graphical User Interfaces (GUIs) are critical to human-computer interaction, yet automating GUI tasks remains challenging due to the complexity and variability of visual environments. Existing approaches often rely on textual representations of GUIs, which introduce limitations in generalization, efficiency, and scalability. In this paper, we introduce Aguvis, a unified pure vision-based framework for autonomous GUI agents that operates across various platforms. Our approach leverages image-based observations, grounds natural-language instructions to visual elements, and employs a consistent action space to ensure cross-platform generalization. To address the limitations of previous work, we integrate explicit planning and reasoning within the model, enhancing its ability to autonomously navigate and interact with complex digital environments. We construct a large-scale dataset of GUI agent trajectories, incorporating multimodal reasoning and grounding, and employ a two-stage training pipeline that first focuses on general GUI grounding, followed by planning and reasoning. Through comprehensive experiments, we demonstrate that Aguvis surpasses previous state-of-the-art methods in both offline and real-world online scenarios, achieving, to our knowledge, the first fully autonomous pure vision GUI agent capable of performing tasks independently without collaboration with external closed-source models. We open-source all datasets, models, and training recipes to facilitate future research at https://aguvis-project.github.io/.
Submitted 5 December, 2024; originally announced December 2024.
Comments: https://aguvis-project.github.io/

arXiv:2412.03578 [pdf, other] cs.SE, cs.AI, cs.CL, cs.PL
PerfCodeGen: Improving Performance of LLM Generated Code with Execution Feedback
Authors: Yun Peng, Akhilesh Deepak Gotmare, Michael Lyu, Caiming Xiong, Silvio Savarese, Doyen Sahoo
Abstract: Large Language Models (LLMs) are widely adopted for assisting in software development tasks, yet their performance evaluations have narrowly focused on the functional correctness of generated code. Human programmers, however, require LLM-generated code to be not only correct but also optimally efficient. We propose PerfCodeGen, a training-free framework that enhances the performance of LLM-generated code by incorporating runtime feedback from test case execution into the self-refinement iterations. With PerfCodeGen, we achieve speedups for a significantly higher proportion of problems compared to using the base LLM with sophisticated prompting techniques. Applied to open language models like Phi-3-mini, PerfCodeGen achieves runtime efficiency comparable to prompting powerful closed models like GPT-4.
We achieve state-of-the-art runtime efficiency on benchmarks such as HumanEval, MBPP, and APPS, frequently surpassing the ground-truth reference solutions with PerfCodeGen using GPT-3.5 and GPT-4. Additionally, we demonstrate the effectiveness of our approach in enhancing code quality across a range of open LLMs of varying sizes, including Phi-3-mini, Llama 3 8B, Mixtral 8x7B, Command R, and Llama 3 70B.
Submitted 18 November, 2024; originally announced December 2024.
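
The core idea here, feeding measured runtime back into self-refinement, fits in a few lines. The sketch below is an assumed reading of the abstract, not the released framework: `generate` and `refine` are hypothetical LLM-backed callables, and candidates are assumed to define a `solve` function checked against `(input, expected)` test pairs.

```python
import time

def refine_for_speed(generate, refine, tests, rounds=3):
    """Keep the fastest correct candidate across refinement rounds,
    passing correctness and timing results back to the refiner."""
    def run(code):
        namespace = {}
        exec(code, namespace)  # assumes a trusted / sandboxed candidate
        start = time.perf_counter()
        ok = all(namespace["solve"](x) == y for x, y in tests)
        return ok, time.perf_counter() - start

    best = generate()
    ok, best_time = run(best)
    for _ in range(rounds):
        candidate = refine(best, f"passed={ok}, runtime={best_time:.4f}s")
        ok2, t = run(candidate)
        if ok2 and t < best_time:  # accept only faster, correct code
            best, best_time, ok = candidate, t, ok2
    return best
```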

arXiv:2411.14743 [pdf, other] cs.CV, cs.AI, q-bio.QM
FOCUS: Knowledge-enhanced Adaptive Visual Compression for Few-shot Whole Slide Image Classification
Authors: Zhengrui Guo, Conghao Xiong, Jiabo Ma, Qichen Sun, Lishuang Feng, Jinzhuo Wang, Hao Chen
Abstract: Few-shot learning presents a critical solution for cancer diagnosis in computational pathology (CPath), addressing fundamental limitations in data availability, particularly the scarcity of expert annotations and patient privacy constraints. A key challenge in this paradigm stems from the inherent disparity between the limited training set of whole slide images (WSIs) and the enormous number of patches they contain, where a significant portion of these patches lacks diagnostically relevant information, potentially diluting the model's ability to learn and focus on critical diagnostic features. While recent works attempt to address this by incorporating additional knowledge, several crucial gaps hinder further progress: (1) despite the emergence of powerful pathology foundation models (FMs), their potential remains largely untapped, with most approaches limiting their use to basic feature extraction; (2) current language guidance mechanisms attempt to align text prompts with vast numbers of WSI patches all at once, struggling to leverage rich pathological semantic information. To this end, we introduce the knowledge-enhanced adaptive visual compression framework, dubbed FOCUS, which uniquely combines pathology FMs with language prior knowledge to enable a focused analysis of diagnostically relevant regions by prioritizing discriminative WSI patches. Our approach implements a progressive three-stage compression strategy: we first leverage FMs for global visual redundancy elimination, then integrate compressed features with language prompts for semantic relevance assessment, and finally perform neighbor-aware visual token filtering while preserving spatial coherence. Extensive experiments on pathological datasets spanning breast, lung, and ovarian cancers demonstrate its superior performance in few-shot pathology diagnosis. Code will be made available at https://github.com/dddavid4real/FOCUS.
Submitted 22 November, 2024; originally announced November 2024.
Comments: 15 pages, 3 figures

arXiv:2411.13547 [pdf, other] cs.SE, cs.AI
SpecTool: A Benchmark for Characterizing Errors in Tool-Use LLMs
Authors: Shirley Kokane, Ming Zhu, Tulika Awalgaonkar, Jianguo Zhang, Thai Hoang, Akshara Prabhakar, Zuxin Liu, Tian Lan, Liangwei Yang, Juntao Tan, Rithesh Murthy, Weiran Yao, Zhiwei Liu, Juan Carlos Niebles, Huan Wang, Shelby Heinecke, Caiming Xiong, Silvio Savarese
Abstract: Evaluating the output of Large Language Models (LLMs) is one of the most critical aspects of building a performant compound AI system. Since the output from LLMs propagates to downstream steps, identifying LLM errors is crucial to system performance. A common task for LLMs in AI systems is tool use.
While there are several benchmark environments for evaluating LLMs on this task, they typically only report a success rate without any explanation of the failure cases. To solve this problem, we introduce SpecTool, a new benchmark to identify error patterns in LLM output on tool-use tasks. Our benchmark dataset comprises queries from diverse environments that can be used to test for the presence of seven newly characterized error patterns. Using SpecTool, we show that even the most prominent LLMs exhibit these error patterns in their outputs. Researchers can use the analysis and insights from SpecTool to guide their error mitigation strategies.
Submitted 20 November, 2024; originally announced November 2024.
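
To make "error patterns in tool calls" concrete, here is an illustrative checker. The patterns below are generic examples of my own choosing; the abstract does not enumerate SpecTool's seven patterns, so this is not the paper's taxonomy.

```python
def classify_tool_call(call, spec):
    """Flag basic tool-call error patterns. `call` is the model's
    {"name": ..., "args": {...}}; `spec` maps each tool name to its set
    of expected argument names (optional arguments are ignored here,
    so anything outside the spec is treated as hallucinated)."""
    errors = []
    if call["name"] not in spec:
        errors.append("nonexistent_tool")
    else:
        expected = spec[call["name"]]
        missing = expected - call["args"].keys()
        extra = call["args"].keys() - expected
        if missing:
            errors.append(f"missing_args:{sorted(missing)}")
        if extra:
            errors.append(f"hallucinated_args:{sorted(extra)}")
    return errors or ["ok"]

spec = {"search": {"query"}, "calculator": {"expression"}}
print(classify_tool_call({"name": "search", "args": {"q": "weather"}}, spec))
```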

arXiv:2411.12644 [pdf, other] cs.SE, cs.AI
CodeXEmbed: A Generalist Embedding Model Family for Multilingual and Multi-task Code Retrieval
Authors: Ye Liu, Rui Meng, Shafiq Joty, Silvio Savarese, Caiming Xiong, Yingbo Zhou, Semih Yavuz
Abstract: Despite the success of text retrieval in many NLP tasks, code retrieval remains a largely underexplored area. Most text retrieval systems are tailored for natural language queries, often neglecting the specific challenges of retrieving code. This gap leaves existing models unable to effectively capture the diversity of programming languages and tasks across different domains, highlighting the need for more focused research in code retrieval. To address this, we introduce CodeXEmbed, a family of large-scale code embedding models ranging from 400M to 7B parameters. Our novel training pipeline unifies multiple programming languages and transforms various code-related tasks into a common retrieval framework, enhancing model generalizability and retrieval performance. Our 7B model sets a new state-of-the-art (SOTA) in code retrieval, outperforming the previous leading model, Voyage-Code, by over 20% on the CoIR benchmark. In addition to excelling in code retrieval, our models demonstrate competitive performance on the widely adopted BeIR text retrieval benchmark, offering versatility across domains. Experimental results demonstrate that improving retrieval performance significantly enhances end-to-end Retrieval-Augmented Generation (RAG) performance for code-related tasks.
Submitted 24 November, 2024; v1 submitted 19 November, 2024; originally announced November 2024.
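
"Transforming code-related tasks into a common retrieval framework" reduces, at inference time, to embedding queries and candidates with one model and ranking by similarity. A minimal sketch, with `embed` as a hypothetical stand-in for the embedding model:

```python
import numpy as np

def retrieve(embed, query, candidates, k=3):
    """Rank candidates by cosine similarity to the query under a
    shared embedding model; `embed` returns a 1-D vector."""
    q = embed(query)
    matrix = np.stack([embed(c) for c in candidates])
    scores = matrix @ q / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(q))
    return [candidates[i] for i in np.argsort(-scores)[:k]]
```

Text-to-code search, code-to-code search, and docstring lookup all become the same call; only the contents of `query` and `candidates` change.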

arXiv:2411.08359 [pdf, other] cs.CR
MultiKG: Multi-Source Threat Intelligence Aggregation for High-Quality Knowledge Graph Representation of Attack Techniques
Authors: Jian Wang, Tiantian Zhu, Chunlin Xiong, Yan Chen
Abstract: The construction of attack technique knowledge graphs aims to transform various types of attack knowledge into structured representations for more effective attack procedure modeling. Existing methods typically rely on textual data, such as Cyber Threat Intelligence (CTI) reports, which are often coarse-grained and unstructured, resulting in incomplete and inaccurate knowledge graphs. To address these issues, we expand attack knowledge sources by incorporating audit logs and static code analysis alongside CTI reports, providing finer-grained data for constructing attack technique knowledge graphs. We propose MultiKG, a fully automated framework that integrates multiple threat knowledge sources. MultiKG processes data from CTI reports, dynamic logs, and static code separately, then merges them into a unified attack knowledge graph. Through system design and the use of Large Language Models (LLMs), MultiKG automates the analysis, construction, and merging of attack graphs across these sources, producing a fine-grained, multi-source attack knowledge graph. We implemented MultiKG and evaluated it using 1,015 real attack techniques and 9,006 attack intelligence entries from CTI reports. Results show that MultiKG effectively extracts attack knowledge graphs from diverse sources and aggregates them into accurate, comprehensive representations. Through case studies, we demonstrate that our approach directly benefits security tasks such as attack reconstruction and detection.
Submitted 13 November, 2024; originally announced November 2024.
Comments: 21 pages, 15 figures, 8 tables

arXiv:2411.07763 [pdf, other] cs.CL, cs.AI, cs.DB
Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows
Authors: Fangyu Lei, Jixuan Chen, Yuxiao Ye, Ruisheng Cao, Dongchan Shin, Hongjin Su, Zhaoqing Suo, Hongcheng Gao, Wenjing Hu, Pengcheng Yin, Victor Zhong, Caiming Xiong, Ruoxi Sun, Qian Liu, Sida Wang, Tao Yu
Abstract: Real-world enterprise text-to-SQL workflows often involve complex cloud or local data across various database systems, multiple SQL queries in various dialects, and diverse operations from data transformation to analytics. We introduce Spider 2.0, an evaluation framework comprising 632 real-world text-to-SQL workflow problems derived from enterprise-level database use cases.
The databases in Spider 2.0 are sourced from real data applications, often containing over 1,000 columns and stored in local or cloud database systems such as BigQuery and Snowflake. We show that solving problems in Spider 2.0 frequently requires understanding and searching through database metadata, dialect documentation, and even project-level codebases. This challenge calls for models to interact with complex SQL workflow environments, process extremely long contexts, perform intricate reasoning, and generate multiple SQL queries with diverse operations, often exceeding 100 lines, which goes far beyond traditional text-to-SQL challenges. Our evaluations indicate that, based on o1-preview, our code agent framework successfully solves only 17.0% of the tasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. Our results on Spider 2.0 show that while language models have demonstrated remarkable performance in code generation, especially in prior text-to-SQL benchmarks, they require significant improvement in order to achieve adequate performance for real-world enterprise usage. Progress on Spider 2.0 represents crucial steps towards developing intelligent, autonomous code agents for real-world enterprise settings. Our code, baseline models, and data are available at https://spider2-sql.github.io.
Submitted 12 November, 2024; originally announced November 2024.

arXiv:2411.07461 [pdf, other] cs.CV, cs.AI
BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions
Authors: Anas Awadalla, Le Xue, Manli Shu, An Yan, Jun Wang, Senthil Purushwalkam, Sheng Shen, Hannah Lee, Oscar Lo, Jae Sung Park, Etash Guha, Silvio Savarese, Ludwig Schmidt, Yejin Choi, Caiming Xiong, Ran Xu
Abstract: We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that bridges the gap between descriptive synthetic captions and factual web-scale alt-text. KALE augments synthetic dense image captions with web-scale alt-text to generate factually grounded image captions. Our two-stage approach leverages large vision-language models and language models to create knowledge-augmented captions, which are then used to train a specialized VLM for scaling up the dataset. We train vision-language models on KALE and demonstrate improvements on vision-language tasks. Our experiments show the utility of KALE for training more capable and knowledgeable multimodal models.
We release the KALE dataset at https://huggingface.co/datasets/Salesforce/blip3-kale.
Submitted 11 November, 2024; originally announced November 2024.
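
The two-stage recipe in this abstract can be written out schematically. Everything below is an assumed reading of the abstract: `vlm_caption`, `llm_fuse`, and `train_vlm` are hypothetical callables, not the released models.

```python
def stage1(images, alt_texts, vlm_caption, llm_fuse):
    """Knowledge-augmented captioning: a VLM writes a dense caption for
    each image, then an LLM fuses it with the factual web alt-text."""
    return [llm_fuse(vlm_caption(img), alt)
            for img, alt in zip(images, alt_texts)]

def stage2(seed_pairs, remaining_images, train_vlm):
    """Scaling up: train a specialized captioner on the stage-1 pairs,
    then let it caption the rest of the corpus."""
    captioner = train_vlm(seed_pairs)
    return [(img, captioner(img)) for img in remaining_images]
```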

arXiv:2411.04329 [pdf, other] cs.CL
CodeTree: Agent-guided Tree Search for Code Generation with Large Language Models
Authors: Jierui Li, Hung Le, Yingbo Zhou, Caiming Xiong, Silvio Savarese, Doyen Sahoo
Abstract: Pre-trained on massive amounts of code and text data, large language models (LLMs) have demonstrated remarkable achievements in performing code generation tasks. With additional execution-based feedback, these models can act as agents with capabilities to self-refine and improve generated code autonomously. However, on challenging coding tasks with an extremely large search space, current agentic approaches still struggle with multi-stage planning, generating, and debugging. To address this problem, we propose CodeTree, a framework for LLM agents to efficiently explore the search space in different stages of the code generation process. Specifically, we adopt a unified tree structure to explicitly explore different coding strategies, generate corresponding coding solutions, and subsequently refine the solutions. In each stage, critical decision-making (ranking, termination, expanding) of the exploration process is guided by both environmental execution-based feedback and LLM-agent-generated feedback. We comprehensively evaluated CodeTree on 7 code generation benchmarks and demonstrated the significant performance gains of CodeTree against strong baselines. Using GPT-4o as the base model, we consistently achieved top results of 95.1 on HumanEval, 98.7 on MBPP, and 43.0 on CodeContests. On the challenging SWEBench benchmark, our approach led to significant performance gains.
Submitted 12 November, 2024; v1 submitted 6 November, 2024; originally announced November 2024.
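
The ranking, termination, and expansion decisions named in the abstract map naturally onto a best-first search, sketched below under stated assumptions: `score` (execution feedback plus an LLM critic, returning a value in [0, 1]) and `expand` (LLM refinement proposing children) are hypothetical callables, and this is not the paper's exact algorithm.

```python
import heapq

def tree_search(root, score, expand, budget=20, width=3):
    """Best-first search over candidate programs."""
    best, best_score = root, score(root)
    frontier = [(-best_score, 0, root)]  # max-heap via negated scores
    counter = 1                          # tie-breaker so heapq never compares code
    for _ in range(budget):
        if not frontier:
            break
        neg, _, code = heapq.heappop(frontier)  # ranking: most promising first
        if -neg == 1.0:
            return code                         # termination: all tests pass
        for child in expand(code, width):       # expansion: refine the node
            s = score(child)
            if s > best_score:
                best, best_score = child, s
            heapq.heappush(frontier, (-s, counter, child))
            counter += 1
    return best
```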

arXiv:2411.04282 [pdf, other] cs.AI, cs.CL, cs.LG, stat.ML
Language Models are Hidden Reasoners: Unlocking Latent Reasoning Capabilities via Self-Rewarding
Authors: Haolin Chen, Yihao Feng, Zuxin Liu, Weiran Yao, Akshara Prabhakar, Shelby Heinecke, Ricky Ho, Phil Mui, Silvio Savarese, Caiming Xiong, Huan Wang
Abstract: Large language models (LLMs) have shown impressive capabilities, but still struggle with complex reasoning tasks requiring multiple steps. While prompt-based methods like Chain-of-Thought (CoT) can improve LLM reasoning at inference time, optimizing reasoning capabilities during training remains challenging. We introduce LaTent Reasoning Optimization (LaTRO), a principled framework that formulates reasoning as sampling from a latent distribution and optimizes it via variational approaches. LaTRO enables LLMs to concurrently improve both their reasoning process and their ability to evaluate reasoning quality, without requiring external feedback or reward models. We validate LaTRO through experiments on the GSM8K and ARC-Challenge datasets using multiple model architectures. On GSM8K, LaTRO improves zero-shot accuracy by an average of 12.5% over base models and 9.6% over supervised fine-tuning across Phi-3.5-mini, Mistral-7B, and Llama-3.1-8B. Our findings suggest that pre-trained LLMs possess latent reasoning capabilities that can be unlocked and enhanced through our proposed optimization approach in a self-improvement manner. The code of LaTRO is available at https://github.com/SalesforceAIResearch/LaTRO.
Submitted 21 November, 2024; v1 submitted 6 November, 2024; originally announced November 2024.
ACM Class: I.2.7

arXiv:2411.02305 [pdf, other] cs.CL, cs.AI
CRMArena: Understanding the Capacity of LLM Agents to Perform Professional CRM Tasks in Realistic Environments
Authors: Kung-Hsiang Huang, Akshara Prabhakar, Sidharth Dhawan, Yixin Mao, Huan Wang, Silvio Savarese, Caiming Xiong, Philippe Laban, Chien-Sheng Wu
Abstract: Customer Relationship Management (CRM) systems are vital for modern enterprises, providing a foundation for managing customer interactions and data. Integrating AI agents into CRM systems can automate routine processes and enhance personalized service. However, deploying and evaluating these agents is challenging due to the lack of realistic benchmarks that reflect the complexity of real-world CRM tasks. To address this issue, we introduce CRMArena, a novel benchmark designed to evaluate AI agents on realistic tasks grounded in professional work environments. Following guidance from CRM experts and industry best practices, we designed CRMArena with nine customer service tasks distributed across three personas: service agent, analyst, and manager.
The benchmark includes 16 commonly used industrial objects (e.g., account, order, knowledge article, case) with high interconnectivity, along with latent variables (e.g., complaint habits, policy violations) to simulate realistic data distributions. Experimental results reveal that state-of-the-art LLM agents succeed in less than 40% of the tasks with ReAct prompting, and in less than 55% even with function-calling abilities. Our findings highlight the need for enhanced agent capabilities in function-calling and rule-following for deployment in real-world work environments. CRMArena is an open challenge to the community: systems that can reliably complete tasks showcase direct business value in a popular work environment.
Submitted 16 February, 2025; v1 submitted 4 November, 2024; originally announced November 2024.
Comments: NAACL 2025

arXiv:2411.01751 [pdf, other] cs.CL, cs.AI
RAGViz: Diagnose and Visualize Retrieval-Augmented Generation
Authors: Tevin Wang, Jingyuan He, Chenyan Xiong
Abstract: Retrieval-augmented generation (RAG) combines knowledge from domain-specific sources into large language models to ground answer generation. Current RAG systems lack customizable visibility into the context documents and the model's attentiveness towards such documents. We propose RAGViz, a RAG diagnosis tool that visualizes the attentiveness of the generated tokens over retrieved documents. With a built-in user interface, retrieval index, and Large Language Model (LLM) backbone, RAGViz provides two main functionalities: (1) token- and document-level attention visualization, and (2) generation comparison upon context document addition and removal. As an open-source toolkit, RAGViz can be easily hosted with a custom embedding model and a HuggingFace-supported LLM backbone. Using a hybrid ANN (Approximate Nearest Neighbor) index, a memory-efficient LLM inference tool, and a custom context snippet method, RAGViz operates efficiently with a median query time of about 5 seconds on a moderate GPU node. Our code is available at https://github.com/cxcscmu/RAGViz. A demo video of RAGViz can be found at https://youtu.be/cTAbuTu6ur4.
Submitted 3 November, 2024; originally announced November 2024.
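
Document-level attention scores of the kind RAGViz visualizes can be derived from a model's attention matrix by a simple aggregation. The aggregation below is an assumption for illustration, not taken from the RAGViz implementation: sum attention mass from every generated token over each document's token span, then normalize.

```python
import numpy as np

def document_attention(attn, doc_spans):
    """attn: [generated_tokens, context_tokens] attention weights;
    doc_spans: {doc_id: (start, end)} token offsets of each retrieved
    document in the context. Returns a normalized score per document."""
    totals = {doc: attn[:, s:e].sum() for doc, (s, e) in doc_spans.items()}
    z = sum(totals.values()) or 1.0
    return {doc: v / z for doc, v in totals.items()}

attn = np.random.rand(12, 100)  # toy attention matrix
print(document_attention(attn, {"doc_a": (0, 40), "doc_b": (40, 100)}))
```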

arXiv:2411.00786 [pdf, other] cs.IR
Interpret and Control Dense Retrieval with Sparse Latent Features
Authors: Hao Kang, Tevin Wang, Chenyan Xiong
Abstract: Dense embeddings deliver strong retrieval performance but often lack interpretability and controllability. This paper introduces a novel approach using sparse autoencoders (SAEs) to interpret and control dense embeddings via the learned latent sparse features. Our key contribution is the development of a retrieval-oriented contrastive loss, which ensures the sparse latent features remain effective for retrieval tasks and thus meaningful to interpret. Experimental results demonstrate that both the learned latent sparse features and their reconstructed embeddings retain nearly the same retrieval accuracy as the original dense vectors, affirming their faithfulness. Our further examination of the sparse latent space reveals interesting features underlying the dense embeddings, and we can control retrieval behavior by manipulating the latent sparse features, for example, prioritizing documents from specific perspectives in the retrieval results.
Submitted 17 October, 2024; originally announced November 2024.
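
A minimal sketch of the idea, assuming standard components: an SAE reconstructs dense embeddings through a sparse latent layer, and a contrastive term keeps reconstructed queries matched to their documents. The loss weights and the exact form of the contrastive term are assumptions, not the paper's formulation.

```python
import torch
import torch.nn.functional as F

class SAE(torch.nn.Module):
    """Sparse autoencoder over dense retrieval embeddings."""
    def __init__(self, dim=768, latent=8192):
        super().__init__()
        self.enc = torch.nn.Linear(dim, latent)
        self.dec = torch.nn.Linear(latent, dim)

    def forward(self, x):
        z = F.relu(self.enc(x))  # non-negative sparse code
        return self.dec(z), z

def loss(model, queries, docs, l1=1e-3, tau=0.05):
    """Reconstruction + sparsity + a retrieval-oriented contrastive
    term with in-batch negatives (row i of `docs` is the positive
    for row i of `queries`)."""
    q_hat, zq = model(queries)
    d_hat, zd = model(docs)
    recon = F.mse_loss(q_hat, queries) + F.mse_loss(d_hat, docs)
    sparsity = zq.abs().mean() + zd.abs().mean()
    sims = q_hat @ d_hat.T / tau
    labels = torch.arange(len(queries))
    contrastive = F.cross_entropy(sims, labels)
    return recon + l1 * sparsity + contrastive
```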
arXiv:2411.00142 [pdf, other] cs.CL cs.AI

JudgeRank: Leveraging Large Language Models for Reasoning-Intensive Reranking

Authors: Tong Niu, Shafiq Joty, Ye Liu, Caiming Xiong, Yingbo Zhou, Semih Yavuz

Abstract: Accurate document retrieval is crucial for the success of retrieval-augmented generation (RAG) applications, including open-domain question answering and code completion. While large language models (LLMs) have been employed as dense encoders or listwise rerankers in RAG systems, they often struggle with reasoning-intensive tasks because they lack nuanced analysis when judging document relevance. To address this limitation, we introduce JudgeRank, a novel agentic reranker that emulates human cognitive processes when assessing document relevance. Our approach consists of three key steps: (1) query analysis to identify the core problem, (2) document analysis to extract a query-aware summary, and (3) relevance judgment to provide a concise assessment of document relevance. We evaluate JudgeRank on the reasoning-intensive BRIGHT benchmark, demonstrating substantial performance improvements over first-stage retrieval methods and outperforming other popular reranking approaches. In addition, JudgeRank performs on par with fine-tuned state-of-the-art rerankers on the popular BEIR benchmark, validating its zero-shot generalization capability. Through comprehensive ablation studies, we demonstrate that JudgeRank's performance generalizes well across LLMs of various sizes, while ensembling them yields even more accurate reranking than individual models.

Submitted 31 October, 2024; originally announced November 2024.
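The three steps the abstract enumerates map naturally onto a chain of prompts. A minimal sketch, assuming `llm` is any callable mapping a prompt string to a completion string; the prompt wording and binary scoring are illustrative assumptions:

```python
def judge_rank(llm, query, documents):
    """Three-step agentic relevance judgment (sketch of the JudgeRank idea)."""
    # (1) query analysis: identify the core problem
    core = llm(f"Identify the core problem behind this query:\n{query}")
    scored = []
    for doc in documents:
        # (2) document analysis: query-aware summary
        summary = llm(f"Summarize this document with respect to the query "
                      f"'{query}':\n{doc}")
        # (3) relevance judgment: concise yes/no assessment
        verdict = llm(f"Core problem: {core}\nDocument summary: {summary}\n"
                      f"Is the document relevant? Answer 'yes' or 'no'.")
        relevant = verdict.strip().lower().startswith("yes")
        scored.append((doc, 1.0 if relevant else 0.0))
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
```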
arXiv:2410.21620 [pdf, other] cs.AI

Asynchronous Tool Usage for Real-Time Agents

Authors: Antonio A. Ginart, Naveen Kodali, Jason Lee, Caiming Xiong, Silvio Savarese, John Emmons

Abstract: While frontier large language models (LLMs) are capable tool-using agents, current AI systems still operate in a strict turn-based fashion, oblivious to the passage of time. This synchronous design forces user queries and tool use to occur sequentially, preventing the systems from multitasking and reducing interactivity. To address this limitation, we introduce asynchronous AI agents capable of parallel processing and real-time tool use. Our key contribution is an event-driven finite-state machine architecture for agent execution and prompting, integrated with automatic speech recognition and text-to-speech. Drawing inspiration from concepts originally developed for real-time operating systems, this work presents both a conceptual framework and practical tools for creating AI agents capable of fluid, multitasking interactions.

Submitted 28 October, 2024; originally announced October 2024.
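The core of an event-driven finite-state machine for an agent is a single event queue that interleaves user events with tool-completion events, so a slow tool never blocks the conversation. A toy sketch using Python's asyncio; the states and event names are assumptions, not the paper's architecture:

```python
import asyncio

async def slow_tool(name, seconds):
    await asyncio.sleep(seconds)          # stand-in for a real tool call
    return f"{name}: done"

async def agent():
    state, events = "LISTENING", asyncio.Queue()
    # launch a tool without blocking the conversation
    task = asyncio.create_task(slow_tool("web_search", 1.0))
    task.add_done_callback(lambda t: events.put_nowait(("TOOL_DONE", t.result())))
    events.put_nowait(("USER_UTTERANCE", "keep chatting while it runs"))
    for _ in range(2):                    # handle two events, FSM-style
        kind, payload = await events.get()
        if kind == "USER_UTTERANCE" and state == "LISTENING":
            state = "RESPONDING"
            print("[RESPONDING]", payload)
            state = "LISTENING"
        elif kind == "TOOL_DONE":
            state = "REPORTING"
            print("[REPORTING]", payload)

asyncio.run(agent())
```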
mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18528v1-abstract-short" style="display: inline;"> We introduce the Principled Reasoning and Acting (PRAct) framework, a novel method for learning and enforcing action principles from trajectory data. Central to our approach is the use of text gradients from a reflection and optimization engine to derive these action principles. To adapt action principles to specific task requirements, we propose a new optimization framework, Reflective Principle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18528v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18528v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18528v1-abstract-full" style="display: none;"> We introduce the Principled Reasoning and Acting (PRAct) framework, a novel method for learning and enforcing action principles from trajectory data. Central to our approach is the use of text gradients from a reflection and optimization engine to derive these action principles. To adapt action principles to specific task requirements, we propose a new optimization framework, Reflective Principle Optimization (RPO). After execution, RPO employs a reflector to critique current action principles and an optimizer to update them accordingly. We develop the RPO framework under two scenarios: Reward-RPO, which uses environmental rewards for reflection, and Self-RPO, which conducts self-reflection without external rewards. Additionally, two RPO methods, RPO-Traj and RPO-Batch, is introduced to adapt to different settings. Experimental results across four environments demonstrate that the PRAct agent, leveraging the RPO framework, effectively learns and applies action principles to enhance performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18528v1-abstract-full').style.display = 'none'; document.getElementById('2410.18528v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SIG CoNLL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16597">arXiv:2410.16597</a> <span> [<a href="https://arxiv.org/pdf/2410.16597">pdf</a>, <a href="https://arxiv.org/format/2410.16597">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Distill-SynthKG: Distilling Knowledge Graph Synthesis Workflow for Improved Coverage and Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choubey%2C+P+K">Prafulla Kumar Choubey</a>, <a href="/search/cs?searchtype=author&query=Su%2C+X">Xin Su</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+M">Man Luo</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiangyu Peng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Le%2C+T">Tiep Le</a>, <a href="/search/cs?searchtype=author&query=Rosenman%2C+S">Shachar Rosenman</a>, <a href="/search/cs?searchtype=author&query=Lal%2C+V">Vasudev Lal</a>, <a href="/search/cs?searchtype=author&query=Mui%2C+P">Phil Mui</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+R">Ricky Ho</a>, <a href="/search/cs?searchtype=author&query=Howard%2C+P">Phillip Howard</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chien-Sheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16597v1-abstract-short" style="display: inline;"> Knowledge graphs (KGs) generated by large language models (LLMs) are becoming increasingly valuable for Retrieval-Augmented Generation (RAG) applications that require knowledge-intensive reasoning. However, existing KG extraction methods predominantly rely on prompt-based approaches, which are inefficient for processing large-scale corpora. These approaches often suffer from information loss, part… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16597v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16597v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16597v1-abstract-full" style="display: none;"> Knowledge graphs (KGs) generated by large language models (LLMs) are becoming increasingly valuable for Retrieval-Augmented Generation (RAG) applications that require knowledge-intensive reasoning. However, existing KG extraction methods predominantly rely on prompt-based approaches, which are inefficient for processing large-scale corpora. These approaches often suffer from information loss, particularly with long documents, due to the lack of specialized design for KG construction. Additionally, there is a gap in evaluation datasets and methodologies for ontology-free KG construction. To overcome these limitations, we propose SynthKG, a multi-step, document-level ontology-free KG synthesis workflow based on LLMs. 
arXiv:2410.16597 [pdf, other] cs.CL cs.IR

Distill-SynthKG: Distilling Knowledge Graph Synthesis Workflow for Improved Coverage and Efficiency

Authors: Prafulla Kumar Choubey, Xin Su, Man Luo, Xiangyu Peng, Caiming Xiong, Tiep Le, Shachar Rosenman, Vasudev Lal, Phil Mui, Ricky Ho, Phillip Howard, Chien-Sheng Wu

Abstract: Knowledge graphs (KGs) generated by large language models (LLMs) are becoming increasingly valuable for Retrieval-Augmented Generation (RAG) applications that require knowledge-intensive reasoning. However, existing KG extraction methods predominantly rely on prompt-based approaches, which are inefficient for processing large-scale corpora. These approaches often suffer from information loss, particularly with long documents, due to the lack of specialized design for KG construction. Additionally, there is a gap in evaluation datasets and methodologies for ontology-free KG construction. To overcome these limitations, we propose SynthKG, a multi-step, document-level, ontology-free KG synthesis workflow based on LLMs. By fine-tuning a smaller LLM on the synthesized document-KG pairs, we streamline the multi-step process into a single-step KG generation approach called Distill-SynthKG, substantially reducing the number of LLM inference calls. Furthermore, we re-purpose existing question-answering datasets to establish KG evaluation datasets and introduce new evaluation metrics. Using KGs produced by Distill-SynthKG, we also design a novel graph-based retrieval framework for RAG. Experimental results demonstrate that Distill-SynthKG not only surpasses all baseline models in KG quality (including models up to eight times larger) but also consistently excels in retrieval and question-answering tasks. Our proposed graph retrieval framework also outperforms all KG-retrieval methods across multiple benchmark datasets. We release the SynthKG dataset and Distill-SynthKG model publicly to support further research and development.

Submitted 21 October, 2024; originally announced October 2024.
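Distillation here amounts to turning each multi-step SynthKG output into a single-step fine-tuning example (document in, serialized KG out). A toy sketch of that data preparation; the triple serialization format is an assumption:

```python
import json

def build_distillation_pair(document, triples):
    """Turn one workflow output into a single-step SFT example (sketch)."""
    target = "\n".join(f"({s} | {r} | {o})" for s, r, o in triples)
    return {"prompt": ("Extract the knowledge graph as "
                       f"(subject | relation | object) triples:\n\n{document}"),
            "completion": target}

pair = build_distillation_pair(
    "Marie Curie won the Nobel Prize in Physics in 1903.",
    [("Marie Curie", "won", "Nobel Prize in Physics"),
     ("Marie Curie", "award year", "1903")])
print(json.dumps(pair, indent=2))
```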
arXiv:2410.16267 [pdf, other] cs.CV cs.AI cs.CL cs.LG

xGen-MM-Vid (BLIP-3-Video): You Only Need 32 Tokens to Represent a Video Even in VLMs

Authors: Michael S. Ryoo, Honglu Zhou, Shrikant Kendre, Can Qin, Le Xue, Manli Shu, Silvio Savarese, Ran Xu, Caiming Xiong, Juan Carlos Niebles

Abstract: We present xGen-MM-Vid (BLIP-3-Video): a multimodal language model for videos, particularly designed to efficiently capture temporal information over multiple frames. BLIP-3-Video takes advantage of a 'temporal encoder' in addition to the conventional visual tokenizer, which maps a sequence of tokens over multiple frames into a compact set of visual tokens. This enables BLIP-3-Video to use far fewer visual tokens than its competing models (e.g., 32 vs. 4608 tokens). We explore different types of temporal encoders, including learnable spatio-temporal pooling as well as sequential models like Token Turing Machines. We experimentally confirm that BLIP-3-Video obtains video question-answering accuracies comparable to much larger state-of-the-art models (e.g., 34B), while being much smaller (i.e., 4B) and more efficient by using fewer visual tokens. The project website is at https://www.salesforceairesearch.com/opensource/xGen-MM-Vid/index.html

Submitted 21 October, 2024; originally announced October 2024.
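One way to realize a learnable pooling temporal encoder is cross-attention from a fixed set of 32 learned queries onto all frame tokens. A minimal PyTorch sketch under that assumption; dimensions and the single-layer design are illustrative, not the model's actual encoder:

```python
import torch

class TemporalAttentionPool(torch.nn.Module):
    """Map (frames x patches) visual tokens to a fixed set of 32 tokens."""
    def __init__(self, d_model=512, n_out=32, n_heads=8):
        super().__init__()
        self.queries = torch.nn.Parameter(torch.randn(n_out, d_model) * 0.02)
        self.attn = torch.nn.MultiheadAttention(d_model, n_heads,
                                                batch_first=True)

    def forward(self, frame_tokens):              # (B, T*P, d_model)
        q = self.queries.unsqueeze(0).expand(frame_tokens.size(0), -1, -1)
        pooled, _ = self.attn(q, frame_tokens, frame_tokens)
        return pooled                             # (B, 32, d_model)

video = torch.randn(2, 8 * 144, 512)              # 8 frames x 144 patch tokens
print(TemporalAttentionPool()(video).shape)       # torch.Size([2, 32, 512])
```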
arXiv:2410.15531 [pdf, other] cs.CL

Do RAG Systems Cover What Matters? Evaluating and Optimizing Responses with Sub-Question Coverage

Authors: Kaige Xie, Philippe Laban, Prafulla Kumar Choubey, Caiming Xiong, Chien-Sheng Wu

Abstract: Evaluating retrieval-augmented generation (RAG) systems remains challenging, particularly for open-ended questions that lack definitive answers and require coverage of multiple sub-topics. In this paper, we introduce a novel evaluation framework based on sub-question coverage, which measures how well a RAG system addresses different facets of a question. We propose decomposing questions into sub-questions and classifying them into three types (core, background, and follow-up) to reflect their roles and importance. Using this categorization, we introduce a fine-grained evaluation protocol that provides insights into the retrieval and generation characteristics of RAG systems, including three commercial generative answer engines: You.com, Perplexity AI, and Bing Chat. Interestingly, we find that while all answer engines cover core sub-questions more often than background or follow-up ones, they still miss around 50% of core sub-questions, revealing clear opportunities for improvement. Further, sub-question coverage metrics prove effective for ranking responses, achieving 82% accuracy compared to human preference annotations. Lastly, we also demonstrate that leveraging core sub-questions enhances both retrieval and answer generation in a RAG system, resulting in a 74% win rate over the baseline that lacks sub-questions.

Submitted 20 October, 2024; originally announced October 2024.
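Decompose, classify, then check each sub-question against the response: that pipeline can be sketched with two LLM calls per question. The prompt formats and the per-type coverage metric below are assumptions, not the paper's protocol:

```python
def sub_question_coverage(llm, question, response):
    """Per-type coverage of core/background/follow-up sub-questions (sketch)."""
    subs = llm("Decompose into sub-questions, one per line, each prefixed "
               f"with core/background/follow-up and a colon:\n{question}")
    counts, hits = {}, {}
    for line in subs.splitlines():
        if not line.strip():
            continue
        kind, _, sq = line.partition(":")
        kind = kind.strip().lower()
        counts[kind] = counts.get(kind, 0) + 1
        answered = llm(f"Does this response address the sub-question "
                       f"'{sq.strip()}'? Answer yes or no:\n{response}")
        hits[kind] = hits.get(kind, 0) + answered.strip().lower().startswith("yes")
    return {k: hits.get(k, 0) / counts[k] for k in counts}
```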
arXiv:2410.14208 [pdf, other] cs.CL cs.AI cs.LG

Montessori-Instruct: Generate Influential Training Data Tailored for Student Learning

Authors: Xiaochuan Li, Zichun Yu, Chenyan Xiong

Abstract: Synthetic data has been widely used to train large language models, but its generative nature inevitably introduces noisy, non-informative, and misleading learning signals. In this paper, we propose Montessori-Instruct, a novel data synthesis framework that tailors the data synthesis ability of the teacher language model toward the student language model's learning process. Specifically, we utilize the local data influence of synthetic training data points on students to characterize students' learning preferences. Then, we train the teacher model with Direct Preference Optimization (DPO) to generate synthetic data tailored toward student learning preferences. Experiments with Llama3-8B-Instruct (teacher) and Llama3-8B (student) on Alpaca Eval and MT-Bench demonstrate that Montessori-Instruct significantly outperforms standard synthesis methods by 18.35% and 46.24% relatively. Our method also beats data synthesized by a stronger teacher model, GPT-4o. Further analysis confirms the benefits of the teacher learning to generate more influential training data for the student's improved learning, the advantages of local data influence in accurately measuring student preferences, and the robustness of Montessori-Instruct across different student models. Our code and data are open-sourced at https://github.com/cxcscmu/Montessori-Instruct.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14208v1-abstract-full').style.display = 'none'; document.getElementById('2410.14208v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Codes and data are open-sourced at https://github.com/cxcscmu/Montessori-Instruct</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14180">arXiv:2410.14180</a> <span> [<a href="https://arxiv.org/pdf/2410.14180">pdf</a>, <a href="https://arxiv.org/format/2410.14180">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> XForecast: Evaluating Natural Language Explanations for Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aksu%2C+T">Taha Aksu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chenghao Liu</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+A">Amrita Saha</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+S">Sarah Tan</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Sahoo%2C+D">Doyen Sahoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14180v2-abstract-short" style="display: inline;"> Time series forecasting aids decision-making, especially for stakeholders who rely on accurate predictions, making it very important to understand and explain these models to ensure informed decisions. Traditional explainable AI (XAI) methods, which underline feature or temporal importance, often require expert knowledge. In contrast, natural language explanations (NLEs) are more accessible to lay… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14180v2-abstract-full').style.display = 'inline'; document.getElementById('2410.14180v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14180v2-abstract-full" style="display: none;"> Time series forecasting aids decision-making, especially for stakeholders who rely on accurate predictions, making it very important to understand and explain these models to ensure informed decisions. Traditional explainable AI (XAI) methods, which underline feature or temporal importance, often require expert knowledge. In contrast, natural language explanations (NLEs) are more accessible to laypeople. However, evaluating forecast NLEs is difficult due to the complex causal relationships in time series data. To address this, we introduce two new performance metrics based on simulatability, assessing how well a human surrogate can predict model forecasts using the explanations. Experiments show these metrics differentiate good from poor explanations and align with human judgments. 
arXiv:2410.14180 [pdf, other] cs.CL

XForecast: Evaluating Natural Language Explanations for Time Series Forecasting

Authors: Taha Aksu, Chenghao Liu, Amrita Saha, Sarah Tan, Caiming Xiong, Doyen Sahoo

Abstract: Time series forecasting aids decision-making, especially for stakeholders who rely on accurate predictions, making it very important to understand and explain these models to ensure informed decisions. Traditional explainable AI (XAI) methods, which underline feature or temporal importance, often require expert knowledge. In contrast, natural language explanations (NLEs) are more accessible to laypeople. However, evaluating forecast NLEs is difficult due to the complex causal relationships in time series data. To address this, we introduce two new performance metrics based on simulatability, assessing how well a human surrogate can predict model forecasts using the explanations. Experiments show these metrics differentiate good from poor explanations and align with human judgments. Utilizing these metrics, we further evaluate the ability of state-of-the-art large language models (LLMs) to generate explanations for time series data, finding that numerical reasoning, rather than model size, is the main factor influencing explanation quality.

Submitted 20 October, 2024; v1 submitted 18 October, 2024; originally announced October 2024.
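Simulatability can be operationalized as the gain in a surrogate's ability to reproduce the model's forecast when given the explanation versus not. A toy numeric sketch under that assumption; the metric definition and toy surrogate are illustrative, not the paper's exact metrics:

```python
import numpy as np

def simulatability_gain(surrogate, history, model_forecast, explanation):
    """How much an explanation helps a surrogate mimic the forecast (sketch).

    `surrogate(history, explanation)` returns a forecast array; pass
    explanation=None for the unaided baseline.
    """
    err_without = np.abs(surrogate(history, None) - model_forecast).mean()
    err_with = np.abs(surrogate(history, explanation) - model_forecast).mean()
    return err_without - err_with   # positive => explanation is informative

# Toy surrogate: follows the explanation's stated trend when present
def toy_surrogate(history, explanation):
    base = np.full(3, history[-1], dtype=float)
    return base + (1.0 if explanation else 0.0) * np.arange(1, 4)

print(simulatability_gain(toy_surrogate, np.array([5.0, 6.0]),
                          np.array([7.0, 8.0, 9.0]),
                          "upward trend of +1 per step"))   # 2.0
```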
arXiv:2410.13824 [pdf, other] cs.CV cs.CL

Harnessing Webpage UIs for Text-Rich Visual Understanding

Authors: Junpeng Liu, Tianyue Ou, Yifan Song, Yuxiao Qu, Wai Lam, Chenyan Xiong, Wenhu Chen, Graham Neubig, Xiang Yue

Abstract: Text-rich visual understanding, the ability to process environments where dense textual content is integrated with visuals, is crucial for multimodal large language models (MLLMs) to interact effectively with structured environments. To enhance this capability, we propose synthesizing general multimodal instructions from webpage UIs using text-based large language models (LLMs). Despite lacking direct visual input, text-based LLMs are able to process structured text representations from webpage accessibility trees. These instructions are then paired with UI screenshots to train multimodal models. We introduce MultiUI, a dataset containing 7.3 million samples from 1 million websites, covering diverse multimodal tasks and UI layouts. Models trained on MultiUI not only excel in web UI tasks (achieving up to a 48% improvement on VisualWebBench and a 19.1% boost in element accuracy on the web agent dataset Mind2Web) but also generalize surprisingly well to non-web UI tasks and even to non-UI domains, such as document understanding, OCR, and chart interpretation. These results highlight the broad applicability of web UI data for advancing text-rich visual understanding across various scenarios.

Submitted 6 November, 2024; v1 submitted 17 October, 2024; originally announced October 2024.
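The synthesis step amounts to serializing an accessibility tree as text, deriving an instruction from it, and pairing the result with the page's screenshot. A toy sketch; the tree structure and template (standing in for an LLM call) are hypothetical:

```python
import json

# A tiny accessibility-tree fragment (hypothetical structure for illustration)
a11y_tree = {"role": "form", "children": [
    {"role": "textbox", "name": "Email"},
    {"role": "button", "name": "Subscribe"},
]}

def tree_to_instruction(tree):
    """Derive one grounded instruction sample from an accessibility tree.

    A real pipeline would prompt a text-based LLM with the serialized tree;
    a template stands in for that call here.
    """
    button = next(c["name"] for c in tree["children"] if c["role"] == "button")
    return {"instruction": f"Fill in the form, then click '{button}'.",
            "grounding": json.dumps(tree)}  # paired later with a screenshot

print(tree_to_instruction(a11y_tree)["instruction"])
```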
arXiv:2410.13509 [pdf, other] cs.CL

RAG-DDR: Optimizing Retrieval-Augmented Generation Using Differentiable Data Rewards

Authors: Xinze Li, Sen Mei, Zhenghao Liu, Yukun Yan, Shuo Wang, Shi Yu, Zheni Zeng, Hao Chen, Ge Yu, Zhiyuan Liu, Maosong Sun, Chenyan Xiong

Abstract: Retrieval-Augmented Generation (RAG) has proven its effectiveness in mitigating hallucinations in Large Language Models (LLMs) by retrieving knowledge from external resources. To adapt LLMs for RAG pipelines, current approaches use instruction tuning to optimize LLMs, improving their ability to utilize retrieved knowledge. This supervised fine-tuning (SFT) approach focuses on equipping LLMs to handle diverse RAG tasks using different instructions. However, it trains RAG modules to overfit training signals and overlooks the varying data preferences among agents within the RAG system. In this paper, we propose a Differentiable Data Rewards (DDR) method, which end-to-end trains RAG systems by aligning data preferences between different RAG modules. DDR works by collecting rewards to optimize each agent with a rollout method. This method prompts agents to sample some potential responses as perturbations, evaluates the impact of these perturbations on the whole RAG system, and subsequently optimizes the agent to produce outputs that improve the performance of the RAG system. Our experiments on various knowledge-intensive tasks demonstrate that DDR significantly outperforms the SFT method, particularly for LLMs with smaller-scale parameters that depend more on the retrieved knowledge. Additionally, DDR exhibits a stronger capability to align the data preference between RAG modules. The DDR method makes the generation module more effective in extracting key information from documents and mitigating conflicts between parametric memory and external knowledge. All codes are available at https://github.com/OpenMatch/RAG-DDR.

Submitted 17 October, 2024; originally announced October 2024.
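The rollout the abstract describes, sampling perturbed outputs from one module and scoring them by end-to-end system performance, can be sketched in a few lines. The function names and the preference-pair output format are assumptions; the actual DDR optimization lives in the linked repository:

```python
def ddr_rollout(agent_sample, system_reward, context, n=4):
    """One Differentiable Data Rewards rollout (sketch).

    agent_sample(context): draws one candidate output from a RAG module.
    system_reward(output): runs the rest of the pipeline on the candidate
    and scores the final answer. Returns preference data for the module.
    """
    candidates = [agent_sample(context) for _ in range(n)]
    ranked = sorted(candidates, key=system_reward, reverse=True)
    return {"prompt": context, "chosen": ranked[0], "rejected": ranked[-1]}

# Toy usage: reward favors longer candidates
pair = ddr_rollout(lambda ctx: ctx * (1 + hash(ctx) % 3),
                   system_reward=len, context="doc ")
print(pair["chosen"], "|", pair["rejected"])
```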
arXiv:2410.13121 [pdf, other] cs.CV cs.AI

Trust but Verify: Programmatic VLM Evaluation in the Wild

Authors: Viraj Prabhu, Senthil Purushwalkam, An Yan, Caiming Xiong, Ran Xu

Abstract: Vision-Language Models (VLMs) often generate plausible but incorrect responses to visual queries. However, reliably quantifying the effect of such hallucinations in free-form responses to open-ended queries is challenging, as it requires visually verifying each claim within the response. We propose Programmatic VLM Evaluation (PROVE), a new benchmarking paradigm for evaluating VLM responses to open-ended queries. To construct PROVE, we provide a large language model (LLM) with a high-fidelity scene-graph representation constructed from a hyper-detailed image caption, and prompt it to generate diverse question-answer (QA) pairs, as well as programs that can be executed over the scene graph object to verify each QA pair. We thus construct a benchmark of 10.5k challenging but visually grounded QA pairs. Next, to evaluate free-form model responses to queries in PROVE, we propose a programmatic evaluation strategy that measures both the helpfulness and truthfulness of a response within a unified scene-graph-based framework. We benchmark the helpfulness-truthfulness trade-offs of a range of VLMs on PROVE, finding that very few are in fact able to achieve a good balance between the two. Project page: https://prove-explorer.netlify.app/
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13121v1-abstract-full').style.display = 'none'; document.getElementById('2410.13121v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11209">arXiv:2410.11209</a> <span> [<a href="https://arxiv.org/pdf/2410.11209">pdf</a>, <a href="https://arxiv.org/format/2410.11209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> CRUcialG: Reconstruct Integrated Attack Scenario Graphs by Cyber Threat Intelligence Reports </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wenrui Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+T">Tiantian Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tieming Chen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Q">Qixuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+J">Jie Ying</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongmei Li</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Chunlin Xiong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingda Li</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+M">Mingqi Lv</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11209v1-abstract-short" style="display: inline;"> Cyber Threat Intelligence (CTI) reports are factual records compiled by security analysts through their observations of threat events or their own practical experience with attacks. In order to utilize CTI reports for attack detection, existing methods have attempted to map the content of reports onto system-level attack provenance graphs to clearly depict attack procedures. However, existing stud… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11209v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11209v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11209v1-abstract-full" style="display: none;"> Cyber Threat Intelligence (CTI) reports are factual records compiled by security analysts through their observations of threat events or their own practical experience with attacks. In order to utilize CTI reports for attack detection, existing methods have attempted to map the content of reports onto system-level attack provenance graphs to clearly depict attack procedures. However, existing studies on constructing graphs from CTI reports suffer from problems such as weak natural language processing (NLP) capabilities, discrete and fragmented graphs, and insufficient attack semantic representation. Therefore, we propose a system called CRUcialG for the automated reconstruction of attack scenario graphs (ASGs) by CTI reports. 
arXiv:2410.11209 [pdf, other] cs.CR

CRUcialG: Reconstruct Integrated Attack Scenario Graphs by Cyber Threat Intelligence Reports

Authors: Wenrui Cheng, Tiantian Zhu, Tieming Chen, Qixuan Yuan, Jie Ying, Hongmei Li, Chunlin Xiong, Mingda Li, Mingqi Lv, Yan Chen

Abstract: Cyber Threat Intelligence (CTI) reports are factual records compiled by security analysts through their observations of threat events or their own practical experience with attacks. In order to utilize CTI reports for attack detection, existing methods have attempted to map the content of reports onto system-level attack provenance graphs to clearly depict attack procedures. However, existing studies on constructing graphs from CTI reports suffer from problems such as weak natural language processing (NLP) capabilities, discrete and fragmented graphs, and insufficient attack semantic representation. Therefore, we propose a system called CRUcialG for the automated reconstruction of attack scenario graphs (ASGs) from CTI reports. First, we use NLP models to extract systematic attack knowledge from CTI reports to form preliminary ASGs. Then, we propose a four-phase attack rationality verification framework, covering tactical phases and attack procedures, to evaluate the reasonableness of ASGs. Finally, we implement relation repair and phase supplement of ASGs by adopting a serialized graph generation model. We collect a total of 10,607 CTI reports and generate 5,761 complete ASGs. Experimental results on CTI reports from 30 security vendors and DARPA show that the similarity of ASG reconstruction by CRUcialG can reach 84.54%. Compared with the SOTA (EXTRACTOR and AttackG), the recall of CRUcialG (extraction of real attack events) reaches 88.13% and 94.46% respectively, which is 40% higher than SOTA on average. The F1-score of attack phase verification reaches 90.04%.

Submitted 14 October, 2024; originally announced October 2024.
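A toy illustration of the phase-level rationality idea: check that attack phases along ASG edges never move backwards through the kill chain. The phase names, ordering, and edge representation are assumptions; CRUcialG's actual verification also covers procedure-level checks, repair, and supplementation:

```python
# Hypothetical four-phase ordering used for the rationality check
PHASE_ORDER = {"initial_access": 0, "execution": 1,
               "persistence": 2, "exfiltration": 3}

def phases_are_rational(asg_edges):
    """True if no ASG edge goes backwards in the phase ordering (sketch).

    asg_edges: list of (source_phase, destination_phase) pairs.
    """
    return all(PHASE_ORDER[src] <= PHASE_ORDER[dst] for src, dst in asg_edges)

print(phases_are_rational([("initial_access", "execution"),
                           ("execution", "exfiltration")]))     # True
print(phases_are_rational([("persistence", "initial_access")]))  # False
```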