Search | arXiv e-print repository

Showing 1–50 of 155 results for author: Stoica, I

Searching in archive cs. Sorted by announcement date (newest first); 50 results per page.
arXiv:2502.14855 [pdf, other]
Title: Prompt-to-Leaderboard
Authors: Evan Frick, Connor Chen, Joseph Tennyson, Tianle Li, Wei-Lin Chiang, Anastasios N. Angelopoulos, Ion Stoica
Subjects: Machine Learning (cs.LG); Computation and Language (cs.CL)
Abstract: Large language model (LLM) evaluations typically rely on aggregated metrics like accuracy or human preference, averaging across users and prompts. This averaging obscures user- and prompt-specific variations in model performance. To address this, we propose Prompt-to-Leaderboard (P2L), a method that produces leaderboards specific to a prompt. The core idea is to train an LLM taking natural language prompts as input to output a vector of Bradley-Terry coefficients which are then used to predict the human preference vote. The resulting prompt-dependent leaderboards allow for unsupervised task-specific evaluation, optimal routing of queries to models, personalization, and automated evaluation of model strengths and weaknesses. Data from Chatbot Arena suggest that P2L better captures the nuanced landscape of language model performance than the averaged leaderboard. Furthermore, our findings suggest that P2L's ability to produce prompt-specific evaluations follows a power law scaling similar to that observed in LLMs themselves. In January 2025, the router we trained based on this methodology achieved the #1 spot in the Chatbot Arena leaderboard. Our code is available at this GitHub link: https://github.com/lmarena/p2l.
Submitted 20 February, 2025; originally announced February 2025.
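
The Bradley-Terry machinery at the heart of P2L is easy to illustrate. The sketch below assumes a regressor has already produced per-prompt coefficients for a handful of hypothetical models; the real method trains an LLM to emit these coefficients from the prompt text.

```python
import numpy as np

def bt_win_prob(theta_a: float, theta_b: float) -> float:
    """Bradley-Terry probability that model A is preferred over model B."""
    return 1.0 / (1.0 + np.exp(-(theta_a - theta_b)))

def leaderboard(coeffs: dict) -> list:
    """Rank models by their prompt-conditioned Bradley-Terry coefficient."""
    return sorted(coeffs.items(), key=lambda kv: kv[1], reverse=True)

# Hypothetical per-prompt output of a P2L-style regressor:
# one Bradley-Terry coefficient per candidate model.
coeffs = {"model-a": 0.8, "model-b": 0.1, "model-c": -0.4}

print(leaderboard(coeffs))
print("P(model-a preferred over model-c) =",
      bt_win_prob(coeffs["model-a"], coeffs["model-c"]))
```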

arXiv:2502.14815 [pdf, other]
Title: Optimizing Model Selection for Compound AI Systems
Authors: Lingjiao Chen, Jared Quincy Davis, Boris Hanin, Peter Bailis, Matei Zaharia, James Zou, Ion Stoica
Subjects: Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG); Multiagent Systems (cs.MA)
Abstract: Compound AI systems that combine multiple LLM calls, such as self-refine and multi-agent-debate, achieve strong performance on many AI tasks. We address a core question in optimizing compound systems: for each LLM call or module in the system, how should one decide which LLM to use? We show that these LLM choices have a large effect on quality, but the search space is exponential. We propose LLMSelector, an efficient framework for model selection in compound systems, which leverages two key empirical insights: (i) end-to-end performance is often monotonic in how well each module performs, with all other modules held fixed, and (ii) per-module performance can be estimated accurately by an LLM. Building upon these insights, LLMSelector iteratively selects one module and allocates to it the model with the highest module-wise performance, as estimated by an LLM, until no further gain is possible. LLMSelector is applicable to any compound system with a bounded number of modules, and its number of API calls scales linearly with the number of modules, achieving high-quality model allocation both empirically and theoretically. Experiments with popular compound systems such as multi-agent debate and self-refine using LLMs such as GPT-4o, Claude 3.5 Sonnet and Gemini 1.5 show that LLMSelector confers 5%-70% accuracy gains compared to using the same LLM for all modules.
Submitted 20 February, 2025; originally announced February 2025.
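
A rough sketch of the greedy allocation loop the abstract describes, with placeholder callbacks standing in for the LLM-based module-score estimator and the end-to-end evaluator (names and structure are illustrative, not the authors' code):

```python
from typing import Callable, Dict, List

def greedy_model_allocation(
    modules: List[str],
    models: List[str],
    estimate_module_score: Callable[[str, str, Dict[str, str]], float],
    end_to_end_score: Callable[[Dict[str, str]], float],
    max_rounds: int = 10,
) -> Dict[str, str]:
    """Greedy per-module allocation in the spirit of LLMSelector.

    `estimate_module_score(module, model, allocation)` stands in for the
    LLM-based estimate of module-wise performance with all other modules held
    fixed; `end_to_end_score` evaluates a full allocation. Both are
    placeholder callbacks supplied by the caller."""
    allocation = {m: models[0] for m in modules}      # start with a default model everywhere
    best = end_to_end_score(allocation)
    for _ in range(max_rounds):
        improved = False
        for module in modules:                        # pick one module at a time
            scores = {mdl: estimate_module_score(module, mdl, allocation) for mdl in models}
            candidate = dict(allocation, **{module: max(scores, key=scores.get)})
            score = end_to_end_score(candidate)
            if score > best:                          # keep the swap only if it helps end to end
                allocation, best, improved = candidate, score, True
        if not improved:                              # stop when no module yields further gain
            break
    return allocation
```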

arXiv:2502.14382 [pdf, other]
Title: S*: Test Time Scaling for Code Generation
Authors: Dacheng Li, Shiyi Cao, Chengkun Cao, Xiuyu Li, Shangyin Tan, Kurt Keutzer, Jiarong Xing, Joseph E. Gonzalez, Ion Stoica
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI)
Abstract: Increasing test-time compute for LLMs shows promise across domains but remains underexplored in code generation, despite extensive study in math. In this paper, we propose S*, the first hybrid test-time scaling framework that substantially improves the coverage and selection accuracy of generated code. S* extends the existing parallel scaling paradigm with sequential scaling to push performance boundaries. It further leverages a novel selection mechanism that adaptively generates distinguishing inputs for pairwise comparison, combined with execution-grounded information to robustly identify correct solutions. We evaluate across 12 Large Language Models and Large Reasoning Models and show: (1) S* consistently improves performance across model families and sizes, enabling a 3B model to outperform GPT-4o-mini; (2) S* enables non-reasoning models to surpass reasoning models - GPT-4o-mini with S* outperforms o1-preview by 3.7% on LiveCodeBench; (3) S* further boosts state-of-the-art reasoning models - DeepSeek-R1-Distill-Qwen-32B with S* achieves 85.7% on LiveCodeBench, approaching o1 (high) at 88.5%. Code will be available under https://github.com/NovaSky-AI/SkyThought.
Submitted 20 February, 2025; originally announced February 2025.
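
The selection mechanism described above can be pictured as a small tournament: for each pair of candidate programs, ask a generator (an LLM in practice) for an input that distinguishes them, execute both, and let a judge grounded in the observed outputs pick the winner. Everything below is an illustrative stand-in, not the S* implementation:

```python
import os
import subprocess
import sys
import tempfile
from typing import Callable, List

def run_program(code: str, test_input: str, timeout: float = 5.0) -> str:
    """Execute a candidate Python program on a stdin input and return stdout."""
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(code)
        path = f.name
    try:
        proc = subprocess.run([sys.executable, path], input=test_input,
                              capture_output=True, text=True, timeout=timeout)
        return proc.stdout.strip()
    except subprocess.TimeoutExpired:
        return "<timeout>"
    finally:
        os.unlink(path)

def pairwise_select(candidates: List[str],
                    gen_distinguishing_input: Callable[[str, str], str],
                    judge: Callable[[str, str, str], str]) -> str:
    """Tournament selection over candidate programs: probe each pair with an
    adaptively generated input, execute both, and let a judge grounded in the
    observed outputs return 'a' or 'b'. Both callbacks (LLMs in practice) are
    placeholders supplied by the caller."""
    best = candidates[0]
    for challenger in candidates[1:]:
        probe = gen_distinguishing_input(best, challenger)
        out_a, out_b = run_program(best, probe), run_program(challenger, probe)
        if out_a != out_b and judge(probe, out_a, out_b) == "b":
            best = challenger
    return best
```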

arXiv:2502.13965 [pdf, other]
Title: Autellix: An Efficient Serving Engine for LLM Agents as General Programs
Authors: Michael Luo, Xiaoxiang Shi, Colin Cai, Tianjun Zhang, Justin Wong, Yichuan Wang, Chi Wang, Yanping Huang, Zhifeng Chen, Joseph E. Gonzalez, Ion Stoica
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI); Distributed, Parallel, and Cluster Computing (cs.DC)
Abstract: Large language model (LLM) applications are evolving beyond simple chatbots into dynamic, general-purpose agentic programs, which scale LLM calls and output tokens to help AI agents reason, explore, and solve complex tasks. However, existing LLM serving systems ignore dependencies between programs and calls, missing significant opportunities for optimization. Our analysis reveals that programs submitted to LLM serving engines experience long cumulative wait times, primarily due to head-of-line blocking at both the individual LLM request and the program. To address this, we introduce Autellix, an LLM serving system that treats programs as first-class citizens to minimize their end-to-end latencies. Autellix intercepts LLM calls submitted by programs, enriching schedulers with program-level context. We propose two scheduling algorithms - for single-threaded and distributed programs - that preempt and prioritize LLM calls based on their programs' previously completed calls. Our evaluation demonstrates that across diverse LLMs and agentic workloads, Autellix improves throughput of programs by 4-15x at the same latency compared to state-of-the-art systems, such as vLLM.
Submitted 19 February, 2025; originally announced February 2025.
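
A minimal sketch of program-level prioritization in the spirit of what the abstract describes: each new LLM call is ordered by how much service its parent program has already received, so short programs are not stuck behind long-running ones. The class and field names are illustrative, not Autellix's API:

```python
import heapq
from collections import defaultdict
from dataclasses import dataclass, field

@dataclass(order=True)
class LLMCall:
    priority: float                       # lower value = scheduled sooner
    program_id: str = field(compare=False)
    prompt: str = field(compare=False)

class ProgramAwareScheduler:
    """Order each new LLM call by the total service (generated tokens) its
    parent program has already received from previously completed calls."""

    def __init__(self) -> None:
        self.attained = defaultdict(int)   # program_id -> tokens served so far
        self.queue: list = []

    def submit(self, program_id: str, prompt: str) -> None:
        heapq.heappush(self.queue, LLMCall(self.attained[program_id], program_id, prompt))

    def next_call(self) -> LLMCall:
        return heapq.heappop(self.queue)

    def record_completion(self, program_id: str, tokens_generated: int) -> None:
        self.attained[program_id] += tokens_generated
```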

arXiv:2502.09328 [pdf, other]
Title: Copilot Arena: A Platform for Code LLM Evaluation in the Wild
Authors: Wayne Chi, Valerie Chen, Anastasios Nikolas Angelopoulos, Wei-Lin Chiang, Aditya Mittal, Naman Jain, Tianjun Zhang, Ion Stoica, Chris Donahue, Ameet Talwalkar
Subjects: Software Engineering (cs.SE)
Abstract: Evaluating in-the-wild coding capabilities of large language models (LLMs) is a challenging endeavor with no clear solution. We introduce Copilot Arena, a platform to collect user preferences for code generation through native integration into a developer's working environment. Copilot Arena comprises a novel interface for comparing pairs of model outputs, a sampling strategy optimized to reduce latency, and a prompting scheme to enable code completion functionality. Copilot Arena has served over 4.5 million suggestions from 10 models and collected over 11k pairwise judgements. Our results highlight the importance of model evaluations in integrated settings. We find that model rankings from Copilot Arena differ from those of existing evaluations, which we attribute to the more realistic distribution of data and tasks contained in Copilot Arena. We also identify novel insights into human preferences on code such as an observed consistency in user preference across programming languages yet significant variation in preference due to task category. We open-source Copilot Arena and release data to enable human-centric evaluations and improve understanding of coding assistants.
Submitted 13 February, 2025; originally announced February 2025.

arXiv:2502.08235 [pdf, other]
Title: The Danger of Overthinking: Examining the Reasoning-Action Dilemma in Agentic Tasks
Authors: Alejandro Cuadron, Dacheng Li, Wenjie Ma, Xingyao Wang, Yichuan Wang, Siyuan Zhuang, Shu Liu, Luis Gaspar Schroeder, Tian Xia, Huanzhi Mao, Nicholas Thumiger, Aditya Desai, Ion Stoica, Ana Klimovic, Graham Neubig, Joseph E. Gonzalez
Subjects: Artificial Intelligence (cs.AI)
Abstract: Large Reasoning Models (LRMs) represent a breakthrough in AI problem-solving capabilities, but their effectiveness in interactive environments can be limited. This paper introduces and analyzes overthinking in LRMs, a phenomenon where models favor extended internal reasoning chains over environmental interaction. Through experiments on software engineering tasks using SWE Bench Verified, we observe three recurring patterns: Analysis Paralysis, Rogue Actions, and Premature Disengagement. We propose a framework to study these behaviors, which correlates with human expert assessments, and analyze 4018 trajectories. We observe that higher overthinking scores correlate with decreased performance, with reasoning models exhibiting stronger tendencies toward overthinking compared to non-reasoning models. Our analysis reveals that simple efforts to mitigate overthinking in agentic environments, such as selecting the solution with the lower overthinking score, can improve model performance by almost 30% while reducing computational costs by 43%. These results suggest that mitigating overthinking has strong practical implications. We suggest that overthinking tendencies could be mitigated by leveraging native function-calling capabilities and selective reinforcement learning. We also open-source our evaluation framework and dataset to facilitate research in this direction at https://github.com/AlexCuadron/Overthinking.
Submitted 12 February, 2025; originally announced February 2025.
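
The "pick the least-overthinking candidate" mitigation mentioned above reduces to a one-line selection once trajectories have been scored; the scores and patch names below are invented for illustration:

```python
from dataclasses import dataclass
from typing import List

@dataclass
class Trajectory:
    patch: str                  # candidate solution, e.g. a code patch
    overthinking_score: float   # e.g. assigned by an LLM judge using the paper's rubric
    tokens_used: int

def select_by_overthinking(trajectories: List[Trajectory]) -> Trajectory:
    """Keep the candidate whose reasoning stayed most grounded in the environment."""
    return min(trajectories, key=lambda t: t.overthinking_score)

candidates = [
    Trajectory("patch-A", overthinking_score=7.2, tokens_used=48_000),
    Trajectory("patch-B", overthinking_score=2.1, tokens_used=21_000),
]
print(select_by_overthinking(candidates).patch)   # -> patch-B
```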

arXiv:2502.07374 [pdf, other]
Title: LLMs Can Easily Learn to Reason from Demonstrations. Structure, not content, is what matters!
Authors: Dacheng Li, Shiyi Cao, Tyler Griggs, Shu Liu, Xiangxi Mo, Eric Tang, Sumanth Hegde, Kourosh Hakhamaneshi, Shishir G. Patil, Matei Zaharia, Joseph E. Gonzalez, Ion Stoica
Subjects: Artificial Intelligence (cs.AI)
Abstract: Large reasoning models (LRMs) tackle complex reasoning problems by following long chain-of-thoughts (Long CoT) that incorporate reflection, backtracking, and self-validation. However, the training techniques and data requirements to elicit Long CoT remain poorly understood. In this work, we find that a Large Language Model (LLM) can effectively learn Long CoT reasoning through data-efficient supervised fine-tuning (SFT) and parameter-efficient low-rank adaptation (LoRA). With just 17k long CoT training samples, the Qwen2.5-32B-Instruct model achieves significant improvements on a wide range of math and coding benchmarks, including 56.7% (+40.0%) on AIME 2024 and 57.0% (+8.1%) on LiveCodeBench, competitive with the proprietary o1-preview model's scores of 44.6% and 59.1%. More importantly, we find that the structure of Long CoT is critical to the learning process, whereas the content of individual reasoning steps has minimal impact. Perturbations affecting content, such as training on incorrect samples or removing reasoning keywords, have little impact on performance. In contrast, structural modifications that disrupt logical consistency in the Long CoT, such as shuffling or deleting reasoning steps, significantly degrade accuracy. For example, a model trained on Long CoT samples with incorrect answers still achieves only 3.2% lower accuracy compared to training with fully correct samples. These insights deepen our understanding of how to elicit reasoning capabilities in LLMs and highlight key considerations for efficiently training the next generation of reasoning models. This is the academic paper of our previously released Sky-T1-32B-Preview model. Code is available at https://github.com/NovaSky-AI/SkyThought.
Submitted 18 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
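
A hedged sketch of the kind of LoRA-based SFT setup the abstract refers to, using Hugging Face transformers and peft; the rank, alpha, and target modules are illustrative defaults rather than the paper's recipe:

```python
# Illustrative LoRA-based SFT setup with Hugging Face transformers + peft.
# The model name matches the abstract; the hyperparameters are assumptions.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
lora_cfg = LoraConfig(
    r=16,                       # low-rank update dimension (illustrative)
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # attention projections
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()  # only the LoRA adapters are trainable
# ...then run standard supervised fine-tuning on the ~17k long-CoT samples,
# e.g. with trl's SFTTrainer.
```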

arXiv:2502.06155 [pdf, other]
Title: Efficient-vDiT: Efficient Video Diffusion Transformers With Attention Tile
Authors: Hangliang Ding, Dacheng Li, Runlong Su, Peiyuan Zhang, Zhijie Deng, Ion Stoica, Hao Zhang
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Abstract: Despite the promise of synthesizing high-fidelity videos, Diffusion Transformers (DiTs) with 3D full attention suffer from expensive inference due to the complexity of attention computation and numerous sampling steps. For example, the popular Open-Sora-Plan model consumes more than 9 minutes for generating a single video of 29 frames. This paper addresses the inefficiency issue from two aspects: 1) Prune the 3D full attention based on the redundancy within video data; we identify a prevalent tile-style repetitive pattern in the 3D attention maps for video data, and advocate a new family of sparse 3D attention that holds a linear complexity w.r.t. the number of video frames. 2) Shorten the sampling process by adopting existing multi-step consistency distillation; we split the entire sampling trajectory into several segments and perform consistency distillation within each one to activate few-step generation capacities. We further devise a three-stage training pipeline to conjoin the low-complexity attention and few-step generation capacities. Notably, with 0.1% pretraining data, we turn the Open-Sora-Plan-1.2 model into an efficient one that is 7.4x-7.8x faster for 29 and 93 frames 720p video generation with a marginal performance trade-off in VBench. In addition, we demonstrate that our approach is amenable to distributed inference, achieving an additional 3.91x speedup when running on 4 GPUs with sequence parallelism.
Submitted 17 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.

arXiv:2502.04507 [pdf, other]
Title: Fast Video Generation with Sliding Tile Attention
Authors: Peiyuan Zhang, Yongqi Chen, Runlong Su, Hangliang Ding, Ion Stoica, Zhenghong Liu, Hao Zhang
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Abstract: Diffusion Transformers (DiTs) with 3D full attention power state-of-the-art video generation, but suffer from prohibitive compute cost -- when generating just a 5-second 720P video, attention alone takes 800 out of 945 seconds of total inference time. This paper introduces sliding tile attention (STA) to address this challenge. STA leverages the observation that attention scores in pretrained video diffusion models predominantly concentrate within localized 3D windows. By sliding and attending over the local spatial-temporal region, STA eliminates redundancy from full attention. Unlike traditional token-wise sliding window attention (SWA), STA operates tile-by-tile with a novel hardware-aware sliding window design, preserving expressiveness while being hardware-efficient. With careful kernel-level optimizations, STA offers the first efficient 2D/3D sliding-window-like attention implementation, achieving 58.79% MFU. Precisely, STA accelerates attention by 2.8-17x over FlashAttention-2 (FA2) and 1.6-10x over FlashAttention-3 (FA3). On the leading video DiT, HunyuanVideo, STA reduces end-to-end latency from 945s (FA3) to 685s without quality degradation, requiring no training. Enabling finetuning further lowers latency to 268s with only a 0.09% drop on VBench.
Submitted 6 February, 2025; originally announced February 2025.
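
The tile-granularity sliding window can be pictured as a mask over (query tile, key tile) pairs. The sketch below builds such a mask for an illustrative 3D tile grid; it ignores the hardware-aware kernel details that make STA fast in practice:

```python
import numpy as np
from itertools import product

def sliding_tile_mask(grid=(6, 8, 8), window=(3, 3, 3)) -> np.ndarray:
    """Tile-level sliding-window mask for a (T, H, W) grid of tiles.

    mask[q, k] is True when key tile k lies inside the 3D window centred on
    query tile q; attention is then computed only for those tile pairs.
    Grid and window sizes are illustrative."""
    tiles = list(product(*(range(n) for n in grid)))
    n = len(tiles)
    mask = np.zeros((n, n), dtype=bool)
    for qi, q in enumerate(tiles):
        for ki, k in enumerate(tiles):
            mask[qi, ki] = all(abs(q[d] - k[d]) <= window[d] // 2 for d in range(3))
    return mask

m = sliding_tile_mask()
print(f"tile pairs kept: {m.sum()} / {m.size} ({m.mean():.1%})")
```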

arXiv:2502.02770 [pdf, other]
Title: Twilight: Adaptive Attention Sparsity with Hierarchical Top-$p$ Pruning
Authors: Chaofan Lin, Jiaming Tang, Shuo Yang, Hanshuo Wang, Tian Tang, Boyu Tian, Ion Stoica, Song Han, Mingyu Gao
Subjects: Machine Learning (cs.LG); Computation and Language (cs.CL)
Abstract: Leveraging attention sparsity to accelerate long-context large language models (LLMs) has been a hot research topic. However, current algorithms such as sparse attention or key-value (KV) cache compression tend to use a fixed budget, which presents a significant challenge during deployment because it fails to account for the dynamic nature of real-world scenarios, where the optimal balance between accuracy and efficiency can vary greatly. In this paper, we find that borrowing top-$p$ sampling (nucleus sampling) to sparse attention can surprisingly achieve adaptive budgeting. Based on this, we propose Twilight, a framework to bring adaptive sparsity to any existing sparse attention algorithm without sacrificing its accuracy. Empirical results show that Twilight can adaptively prune at most 98% of redundant tokens, leading to $15.4\times$ acceleration in self-attention operations and $3.9\times$ acceleration in end-to-end per token latency in long context LLM decoding.
Submitted 5 February, 2025; v1 submitted 4 February, 2025; originally announced February 2025.
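
The core top-$p$ idea is compact: keep only the smallest set of keys whose softmax mass reaches $p$, so the budget shrinks for peaked attention distributions and grows for flat ones. A minimal NumPy sketch, not Twilight's hierarchical or kernel-level implementation:

```python
import numpy as np

def top_p_prune(scores: np.ndarray, p: float = 0.95) -> np.ndarray:
    """Indices of the smallest set of keys whose softmax mass reaches p,
    so the budget adapts to how peaked the attention distribution is."""
    probs = np.exp(scores - scores.max())
    probs /= probs.sum()
    order = np.argsort(probs)[::-1]                  # keys by descending weight
    keep = np.searchsorted(np.cumsum(probs[order]), p) + 1
    return order[:keep]

rng = np.random.default_rng(0)
peaked = rng.normal(size=4096)
peaked[:8] += 10.0                                   # a few dominant keys
flat = rng.normal(scale=0.01, size=4096)             # nearly uniform scores
print(len(top_p_prune(peaked)), "keys kept for a peaked query")
print(len(top_p_prune(flat)), "keys kept for a flat query")
```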

arXiv:2502.01776 [pdf, other]
Title: Sparse VideoGen: Accelerating Video Diffusion Transformers with Spatial-Temporal Sparsity
Authors: Haocheng Xi, Shuo Yang, Yilong Zhao, Chenfeng Xu, Muyang Li, Xiuyu Li, Yujun Lin, Han Cai, Jintao Zhang, Dacheng Li, Jianfei Chen, Ion Stoica, Kurt Keutzer, Song Han
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)
Abstract: Diffusion Transformers (DiTs) dominate video generation but their high computational cost severely limits real-world applicability, usually requiring tens of minutes to generate a few seconds of video even on high-performance GPUs. This inefficiency primarily arises from the quadratic computational complexity of 3D Full Attention with respect to the context length. In this paper, we propose a training-free framework termed Sparse VideoGen (SVG) that leverages the inherent sparsity in 3D Full Attention to boost inference efficiency. We reveal that the attention heads can be dynamically classified into two groups depending on distinct sparse patterns: (1) Spatial Head, where only spatially-related tokens within each frame dominate the attention output, and (2) Temporal Head, where only temporally-related tokens across different frames dominate. Based on this insight, SVG proposes an online profiling strategy to capture the dynamic sparse patterns and predicts the type of attention head. Combined with a novel hardware-efficient tensor layout transformation and customized kernel implementations, SVG achieves up to 2.28x and 2.33x end-to-end speedup on CogVideoX-v1.5 and HunyuanVideo, respectively, while preserving generation quality.
Submitted 3 February, 2025; originally announced February 2025.
Comments: 13 pages, 8 figures, 3 tables
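
The spatial-versus-temporal head classification can be sketched by measuring where a head's attention mass lands under an assumed frame-major token layout; SVG does this with an online profiling strategy, whereas the function below is an offline illustration:

```python
import numpy as np

def classify_head(attn: np.ndarray, frames: int, tokens_per_frame: int) -> str:
    """Label one attention head 'spatial' or 'temporal' by where its mass lands.

    attn: (N, N) row-stochastic attention map with a frame-major token layout,
    i.e. token i belongs to frame i // tokens_per_frame. This is an offline
    stand-in for SVG's online profiling step."""
    n = frames * tokens_per_frame
    frame_id = np.arange(n) // tokens_per_frame
    pos_id = np.arange(n) % tokens_per_frame
    same_frame = frame_id[:, None] == frame_id[None, :]    # spatial-head pattern
    same_pos = pos_id[:, None] == pos_id[None, :]          # temporal-head pattern
    spatial_mass = (attn * same_frame).sum() / n           # mean per-query mass within the frame
    temporal_mass = (attn * same_pos).sum() / n            # mean per-query mass at the same location
    return "spatial" if spatial_mass >= temporal_mass else "temporal"
```

The chosen label would then decide which sparse mask (within-frame or across-frame) the head uses at inference time.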
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 8 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01697">arXiv:2502.01697</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01697">pdf</a>, <a href="https://arxiv.org/format/2502.01697">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BARE: Combining Base and Instruction-Tuned Language Models for Better Synthetic Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+A">Alan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Asawa%2C+P">Parth Asawa</a>, <a href="/search/cs?searchtype=author&amp;query=Davis%2C+J+Q">Jared Quincy Davis</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lingjiao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hanin%2C+B">Boris Hanin</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Zaharia%2C+M">Matei Zaharia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01697v2-abstract-short" style="display: inline;"> As the demand for high-quality data in model training grows, researchers and developers are increasingly generating synthetic data to tune and train LLMs. A common assumption about synthetic data is that sampling from instruct-tuned models is sufficient; however, these models struggle to produce diverse outputs-a key requirement for generalization. Despite various prompting methods, in this work w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01697v2-abstract-full').style.display = 'inline'; document.getElementById('2502.01697v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01697v2-abstract-full" style="display: none;"> As the demand for high-quality data in model training grows, researchers and developers are increasingly generating synthetic data to tune and train LLMs. A common assumption about synthetic data is that sampling from instruct-tuned models is sufficient; however, these models struggle to produce diverse outputs-a key requirement for generalization. Despite various prompting methods, in this work we show that achieving meaningful diversity from instruct-tuned models remains challenging. In contrast, we find base models without post-training exhibit greater diversity, but are less capable at instruction following and hence of lower quality. Leveraging this insight, we propose Base-Refine (BARE), a synthetic data generation method that combines the diversity of base models with the quality of instruct-tuned models through a two-stage process. 
With minimal few-shot examples and curation, BARE generates diverse and high-quality datasets, improving downstream task performance. We show that fine-tuning with as few as 1,000 BARE-generated samples can reach performance comparable to the best similarly sized models on LiveCodeBench tasks. Furthermore, fine-tuning with BARE-generated data achieves a 101% improvement over instruct-only data on GSM8K and an 18.4% improvement over SOTA methods on RAFT.
Submitted 4 February, 2025; v1 submitted 2 February, 2025; originally announced February 2025.
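The two-stage idea can be sketched in a few lines. This is a hedged illustration of a base-then-refine pipeline in the spirit of the abstract, not the paper's implementation; the `complete` helper and model names are hypothetical placeholders.

```python
# Sketch of a two-stage "base generates, instruct model refines" data pipeline.
# `complete(model_name, prompt)` is a hypothetical text-completion helper.
from typing import Callable, List

def base_then_refine(
    complete: Callable[[str, str], str],
    base_model: str,
    instruct_model: str,
    seed_examples: List[str],
    n_samples: int,
) -> List[str]:
    refined = []
    fewshot = "\n\n".join(seed_examples)
    for i in range(n_samples):
        # Stage 1: sample a diverse draft from the base model via few-shot continuation.
        draft = complete(base_model, f"{fewshot}\n\nExample {i + 1}:")
        # Stage 2: have the instruct-tuned model clean up the draft so it is
        # well-formed and on-format, while preserving its content.
        refined.append(complete(
            instruct_model,
            "Rewrite the following draft so it is a correct, well-formatted "
            f"training example. Keep its topic and content:\n{draft}",
        ))
    return refined
```

The design intuition is that diversity is decided in stage 1 (sampling from the base model) and quality is enforced in stage 2 (refinement), so the two objectives are decoupled.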

arXiv:2501.14312 (https://arxiv.org/abs/2501.14312) [pdf, other]  cs.DC cs.LG
Locality-aware Fair Scheduling in LLM Serving
Authors: Shiyi Cao, Yichuan Wang, Ziming Mao, Pin-Lun Hsu, Liangsheng Yin, Tian Xia, Dacheng Li, Shu Liu, Yineng Zhang, Yang Zhou, Ying Sheng, Joseph Gonzalez, Ion Stoica
Abstract: Large language model (LLM) inference workloads dominate a wide variety of modern AI applications, ranging from multi-turn conversation to document analysis. Balancing fairness and efficiency is critical for managing diverse client workloads with varying prefix patterns. Unfortunately, existing fair scheduling algorithms for LLM serving, such as Virtual Token Counter (VTC), fail to take prefix locality into consideration and thus suffer from poor performance. On the other hand, locality-aware scheduling algorithms in existing LLM serving frameworks tend to maximize the prefix cache hit rate without considering fair sharing among clients. This paper introduces the first locality-aware fair scheduling algorithm, Deficit Longest Prefix Match (DLPM), which can maintain a high degree of prefix locality with a fairness guarantee. We also introduce a novel algorithm, Double Deficit LPM (D^2LPM), which extends DLPM to the distributed setting and finds a balance point among fairness, locality, and load balancing. Our extensive evaluation demonstrates the superior performance of DLPM and D^2LPM in ensuring fairness while maintaining high throughput (up to 2.87x higher than VTC) and low per-client latency (up to 7.18x lower than a state-of-the-art distributed LLM serving system).
Submitted 24 January, 2025; originally announced January 2025.
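The combination of a deficit counter (for fairness) with longest-prefix-match selection (for locality) can be illustrated with a small scheduler sketch. This is my simplification of the idea named in the abstract, not the paper's DLPM pseudocode; the quantum value and refill rule are assumptions.

```python
# Sketch of a deficit-based, locality-aware scheduler: each client accrues a
# token quantum per round, and among clients still within budget we pick the
# request whose prompt shares the longest prefix with the current cache.
from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class Request:
    client: str
    prompt_tokens: List[str]

@dataclass
class DeficitScheduler:
    quantum: int = 256                                   # tokens credited per round (assumed)
    deficits: Dict[str, int] = field(default_factory=dict)

    def _prefix_len(self, req: Request, cache: List[str]) -> int:
        n = 0
        for a, b in zip(req.prompt_tokens, cache):
            if a != b:
                break
            n += 1
        return n

    def pick(self, queue: List[Request], cache: List[str]) -> Optional[Request]:
        for r in queue:
            self.deficits.setdefault(r.client, 0)
        eligible = [r for r in queue if self.deficits[r.client] >= 0]
        if not eligible:                                 # everyone over budget: refill deficits
            for c in self.deficits:
                self.deficits[c] += self.quantum
            eligible = queue
        best = max(eligible, key=lambda r: self._prefix_len(r, cache))
        self.deficits[best.client] -= len(best.prompt_tokens)
        queue.remove(best)
        return best
```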
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12407">arXiv:2501.12407</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12407">pdf</a>, <a href="https://arxiv.org/format/2501.12407">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Streaming Batch Model for Efficient and Fault-Tolerant Heterogeneous Execution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luan%2C+F+S">Frank Sifei Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Z">Ziming Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R+Y">Ron Yifeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Charlotte Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Kamsetty%2C+A">Amog Kamsetty</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+C">Cheng Su</a>, <a href="/search/cs?searchtype=author&amp;query=Veeramani%2C+B">Balaji Veeramani</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Scott Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+S">SangBin Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Zinzow%2C+C">Clark Zinzow</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+E">Eric Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Stephanie Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12407v4-abstract-short" style="display: inline;"> While ML model training and inference are both GPU-intensive, CPU-based data processing is often the bottleneck. Distributed data processing systems based on the batch or stream processing models assume homogeneous resource requirements. They excel at CPU-based computation but either under-utilize heterogeneous resources or impose high overheads on failure and reconfiguration. We introduce the str&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12407v4-abstract-full').style.display = 'inline'; document.getElementById('2501.12407v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12407v4-abstract-full" style="display: none;"> While ML model training and inference are both GPU-intensive, CPU-based data processing is often the bottleneck. Distributed data processing systems based on the batch or stream processing models assume homogeneous resource requirements. They excel at CPU-based computation but either under-utilize heterogeneous resources or impose high overheads on failure and reconfiguration. We introduce the streaming batch model, a hybrid of the two models that enables efficient and fault-tolerant heterogeneous execution. The key idea is to execute one partition at a time to allow lineage-based recovery with dynamic resource allocation. 
This enables memory-efficient pipelining across heterogeneous resources, similar to stream processing, but also offers the elasticity and fault tolerance properties of batch processing. We present Ray Data, an implementation of the streaming batch model that improves throughput on heterogeneous batch inference pipelines by 3-8x compared to traditional batch and stream processing systems. When training Stable Diffusion, Ray Data matches the throughput of single-node ML data loaders while additionally leveraging distributed heterogeneous clusters to further improve training throughput by 31%.
Submitted 16 February, 2025; v1 submitted 16 January, 2025; originally announced January 2025.
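The partition-at-a-time execution that the abstract describes can be shown with a toy pipeline: a CPU stage and a "GPU" stage work on different partitions concurrently, and any partition can be recomputed from its input on failure. This is a generic sketch of the idea, not Ray Data's implementation or API.

```python
# Toy sketch of streaming-batch execution: CPU preprocessing of partition i+1
# overlaps with "GPU" inference on partition i, bounded by a small in-flight window.
import queue, threading
from typing import Iterable, List

def cpu_preprocess(partition: List[int]) -> List[int]:
    return [x * 2 for x in partition]          # stand-in for CPU-heavy decoding

def gpu_infer(partition: List[int]) -> List[int]:
    return [x + 1 for x in partition]          # stand-in for GPU-heavy inference

def streaming_batch(partitions: Iterable[List[int]], max_in_flight: int = 2):
    q: "queue.Queue" = queue.Queue(maxsize=max_in_flight)
    results: List[List[int]] = []

    def producer():
        for p in partitions:
            q.put(cpu_preprocess(p))           # one partition at a time
        q.put(None)

    t = threading.Thread(target=producer)
    t.start()
    while (p := q.get()) is not None:
        results.append(gpu_infer(p))           # overlaps with CPU work on the next partition
    t.join()
    return results

print(streaming_batch([[1, 2], [3, 4], [5, 6]]))
```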

arXiv:2501.07493 (https://arxiv.org/abs/2501.07493) [pdf, other]  cs.LG cs.CR
Exploring and Mitigating Adversarial Manipulation of Voting-Based Leaderboards
Authors: Yangsibo Huang, Milad Nasr, Anastasios Angelopoulos, Nicholas Carlini, Wei-Lin Chiang, Christopher A. Choquette-Choo, Daphne Ippolito, Matthew Jagielski, Katherine Lee, Ken Ziyu Liu, Ion Stoica, Florian Tramer, Chiyuan Zhang
Abstract: It is now common to evaluate Large Language Models (LLMs) by having humans manually vote on model outputs, in contrast to typical benchmarks that evaluate knowledge or skill at some particular task. Chatbot Arena, the most popular benchmark of this type, ranks models by asking users to select the better response between two randomly selected models (without revealing which model was responsible for the generations). These platforms are widely trusted as a fair and accurate measure of LLM capabilities. In this paper, we show that if bot protection and other defenses are not implemented, these voting-based benchmarks are potentially vulnerable to adversarial manipulation. Specifically, we show that an attacker can alter the leaderboard (to promote their favorite model or demote competitors) at the cost of roughly a thousand votes (verified in a simulated, offline version of Chatbot Arena). Our attack consists of two steps: first, we show how an attacker can determine which model was used to generate a given reply with more than 95% accuracy; then, the attacker can use this information to consistently vote for (or against) a target model. Working with the Chatbot Arena developers, we identify, propose, and implement mitigations that, based on our analysis, substantially increase the cost of such attacks. Some of these defenses were present before our collaboration, such as bot protection with Cloudflare, malicious user detection, and rate limiting. Others, including reCAPTCHA and login requirements, are being integrated to strengthen security in Chatbot Arena.
Submitted 13 January, 2025; originally announced January 2025.

arXiv:2412.20221 (https://arxiv.org/abs/2412.20221) [pdf, other]  cs.OS cs.DC cs.NI  doi: 10.1145/3696348.3696858 (https://doi.org/10.1145/3696348.3696858)
Revisiting Cache Freshness for Emerging Real-Time Applications
Authors: Ziming Mao, Rishabh Iyer, Scott Shenker, Ion Stoica
Abstract: Caching is widely used in industry to improve application performance by reducing data-access latency and taking load off the backend infrastructure. TTLs have become the de facto mechanism used to keep cached data reasonably fresh (i.e., not too out of date with respect to the backend). However, the emergence of real-time applications requires tighter data freshness, which is impractical to achieve with TTLs. We discuss why this is the case and propose a simple yet effective adaptive policy to achieve the desired freshness.
Submitted 28 December, 2024; originally announced December 2024.
Comments: HotNets '24
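To show what "adaptive" can mean relative to a fixed TTL, here is one plausible shape of such a policy; it is a hedged illustration only and not necessarily the policy proposed in the paper. The halving/backoff constants are assumptions.

```python
# Hedged illustration of an adaptive freshness policy replacing a fixed TTL:
# a key's refresh interval shrinks when the backend value changed before the
# cache refreshed (observed staleness) and grows slowly when it was still fresh.
class AdaptiveRefresh:
    def __init__(self, target_interval_s: float = 5.0,
                 min_s: float = 0.1, max_s: float = 60.0):
        self.interval = {}            # per-key refresh interval (seconds)
        self.default = target_interval_s
        self.min_s, self.max_s = min_s, max_s

    def next_refresh_in(self, key: str) -> float:
        return self.interval.get(key, self.default)

    def record_refresh(self, key: str, value_changed: bool) -> None:
        cur = self.interval.get(key, self.default)
        if value_changed:             # value had drifted: refresh this key sooner
            cur = max(self.min_s, cur / 2)
        else:                         # value was still current: back off gradually
            cur = min(self.max_s, cur * 1.25)
        self.interval[key] = cur

policy = AdaptiveRefresh()
policy.record_refresh("user:42", value_changed=True)
print(policy.next_refresh_in("user:42"))   # shorter interval after an observed change
```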
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">HotNets &#39;24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18407">arXiv:2412.18407</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18407">pdf</a>, <a href="https://arxiv.org/format/2412.18407">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Statistical Framework for Ranking LLM-Based Chatbots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ameli%2C+S">Siavash Ameli</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+S">Siyuan Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Mahoney%2C+M+W">Michael W. Mahoney</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18407v1-abstract-short" style="display: inline;"> Large language models (LLMs) have transformed natural language processing, with frameworks like Chatbot Arena providing pioneering platforms for evaluating these models. By facilitating millions of pairwise comparisons based on human judgments, Chatbot Arena has become a cornerstone in LLM evaluation, offering rich datasets for ranking models in open-ended conversational tasks. Building upon this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18407v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18407v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18407v1-abstract-full" style="display: none;"> Large language models (LLMs) have transformed natural language processing, with frameworks like Chatbot Arena providing pioneering platforms for evaluating these models. By facilitating millions of pairwise comparisons based on human judgments, Chatbot Arena has become a cornerstone in LLM evaluation, offering rich datasets for ranking models in open-ended conversational tasks. Building upon this foundation, we propose a statistical framework that incorporates key advancements to address specific challenges in pairwise comparison analysis. First, we introduce a factored tie model that enhances the ability to handle ties -- an integral aspect of human-judged comparisons -- significantly improving the model&#39;s fit to observed data. Second, we extend the framework to model covariance between competitors, enabling deeper insights into performance relationships and facilitating intuitive groupings into performance tiers. Third, we resolve optimization challenges arising from parameter non-uniqueness by introducing novel constraints, ensuring stable and interpretable parameter estimation. 
Through rigorous evaluation and extensive experimentation, our framework demonstrates substantial improvements over existing methods in modeling pairwise comparison data. To support reproducibility and practical adoption, we release leaderbot, an open-source Python package implementing our models and analyses.
Submitted 24 December, 2024; originally announced December 2024.
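For intuition about what a "tie model" contributes, the classical Davidson-style extension of Bradley-Terry assigns explicit probability to ties; the sketch below shows that kind of likelihood. The paper's factored tie model is its own construction and may differ; the tie-propensity value here is an assumption.

```python
# Davidson-style win/tie/loss probabilities for two competitors with latent
# log-strengths; shown only to illustrate the role of a tie term in the likelihood.
import math

def win_tie_loss_probs(score_i: float, score_j: float, nu: float = 0.5):
    """score_*: latent strengths on a log scale; nu: tie propensity (assumed)."""
    pi, pj = math.exp(score_i), math.exp(score_j)
    tie = nu * math.sqrt(pi * pj)
    z = pi + pj + tie
    return pi / z, tie / z, pj / z        # P(i wins), P(tie), P(j wins)

print(win_tie_loss_probs(0.30, 0.10))
```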

arXiv:2412.14468 (https://arxiv.org/abs/2412.14468) [pdf, other]  cs.LG cs.AI
HashAttention: Semantic Sparsity for Faster Inference
Authors: Aditya Desai, Shuo Yang, Alejandro Cuadron, Ana Klimovic, Matei Zaharia, Joseph E. Gonzalez, Ion Stoica
Abstract: Utilizing longer contexts is increasingly essential to power better AI systems. However, the cost of attending to long contexts is high due to the involved softmax computation. While scaled dot-product attention (SDPA) exhibits token sparsity, with only a few pivotal tokens significantly contributing to attention, leveraging this sparsity effectively remains an open challenge. Previous methods either suffer from model degradation or require considerable additional resources. We propose HashAttention, a principled approach that casts pivotal token identification as a recommendation problem. Given a query, HashAttention encodes keys and queries in Hamming space, capturing the required semantic similarity using learned mapping functions. HashAttention efficiently identifies pivotal tokens for a given query in this Hamming space using bitwise operations, and only these pivotal tokens are used for attention computation, significantly improving overall attention efficiency. HashAttention can reduce the number of tokens used by a factor of 1/32 for the Llama-3.1-8B model with LongBench, keeping average quality loss within 0.6 points, while using only 32 bits per token of auxiliary memory. At 32x sparsity, HashAttention is 3-6x faster than LightLLM and 2.5-4.5x faster than gpt-fast on an NVIDIA L4 GPU.
Submitted 18 December, 2024; originally announced December 2024.
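The bitwise selection mechanics can be sketched as follows. The paper learns the mapping into bit codes; this sketch substitutes a random sign projection (a standard LSH-style stand-in, my assumption) just to show Hamming-space top-k selection with XOR and popcount.

```python
# Simplified Hamming-space pivotal-token selection (random projection stands in
# for the paper's learned mapping functions).
import numpy as np

def to_bits(x: np.ndarray, proj: np.ndarray) -> np.ndarray:
    """Map vectors [n, d] to uint64 bit codes via the sign of a projection [d, b]."""
    signs = (x @ proj) > 0                                    # [n, b] booleans
    weights = 1 << np.arange(signs.shape[1], dtype=np.uint64)
    return (signs.astype(np.uint64) * weights).sum(axis=1)    # one code per row

def pivotal_tokens(q: np.ndarray, keys: np.ndarray, proj: np.ndarray, k: int) -> np.ndarray:
    q_code = to_bits(q[None, :], proj)[0]
    key_codes = to_bits(keys, proj)
    hamming = np.array([bin(int(q_code ^ c)).count("1") for c in key_codes])
    return np.argsort(hamming)[:k]            # indices of the closest keys in Hamming space

rng = np.random.default_rng(0)
d, b, n = 64, 32, 1024
proj = rng.standard_normal((d, b))
keys, q = rng.standard_normal((n, d)), rng.standard_normal(d)
idx = pivotal_tokens(q, keys, proj, k=32)     # attend only over these keys
print(idx[:8])
```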

arXiv:2412.08687 (https://arxiv.org/abs/2412.08687) [pdf, other]  cs.CV
VisionArena: 230K Real World User-VLM Conversations with Preference Labels
Authors: Christopher Chou, Lisa Dunlap, Koki Mashita, Krishna Mandal, Trevor Darrell, Ion Stoica, Joseph E. Gonzalez, Wei-Lin Chiang
Abstract: With the growing adoption and capabilities of vision-language models (VLMs) comes the need for benchmarks that capture authentic user-VLM interactions. In response, we create VisionArena, a dataset of 230K real-world conversations between users and VLMs. Collected from Chatbot Arena, an open-source platform where users interact with VLMs and submit preference votes, VisionArena spans 73K unique users, 45 VLMs, and 138 languages. Our dataset contains three subsets: VisionArena-Chat, 200K single- and multi-turn conversations between a user and a VLM; VisionArena-Battle, 30K conversations comparing two anonymous VLMs with user preference votes; and VisionArena-Bench, an automatic benchmark of 500 diverse user prompts that efficiently approximates the live Chatbot Arena model rankings. Additionally, we highlight the types of questions asked by users, the influence of response style on preference, and areas where models often fail. We find open-ended tasks like captioning and humor are highly style-dependent, and current VLMs struggle with spatial reasoning and planning tasks. Lastly, we show that finetuning the same base model on VisionArena-Chat outperforms Llava-Instruct-158K, with a 17-point gain on MMMU and a 46-point gain on the WildVision benchmark. Dataset at https://huggingface.co/lmarena-ai
Submitted 13 December, 2024; v1 submitted 11 December, 2024; originally announced December 2024.

arXiv:2412.06394 (https://arxiv.org/abs/2412.06394) [pdf, other]  cs.AI cs.CL
GameArena: Evaluating LLM Reasoning through Live Computer Games
Authors: Lanxiang Hu, Qiyu Li, Anze Xie, Nan Jiang, Ion Stoica, Haojian Jin, Hao Zhang
Abstract: Evaluating the reasoning abilities of large language models (LLMs) is challenging. Existing benchmarks often depend on static datasets, which are vulnerable to data contamination and may become saturated over time, or on binary live human feedback that conflates reasoning with other abilities. As the most prominent dynamic benchmark, Chatbot Arena evaluates open-ended questions in real-world settings, but lacks the granularity to assess specific reasoning capabilities. We introduce GameArena, a dynamic benchmark designed to evaluate LLM reasoning capabilities through interactive gameplay with humans. GameArena consists of three games designed to test specific reasoning capabilities (e.g., deductive and inductive reasoning), while keeping participants entertained and engaged. We analyze the gaming data retrospectively to uncover the underlying reasoning processes of LLMs and measure their fine-grained reasoning capabilities. We collect over 2,000 game sessions and provide detailed assessments of various reasoning capabilities for five state-of-the-art LLMs. Our user study with 100 participants suggests that GameArena improves user engagement compared to Chatbot Arena. For the first time, GameArena enables the collection of step-by-step LLM reasoning data in the wild.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06394v5-abstract-full').style.display = 'none'; document.getElementById('2412.06394v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05408">arXiv:2412.05408</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05408">pdf</a>, <a href="https://arxiv.org/format/2412.05408">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> FogROS2-FT: Fault Tolerant Cloud Robotics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kaiyuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hari%2C+K">Kush Hari</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+T">Trinity Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Michael Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+N">Nan Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Juette%2C+C">Christian Juette</a>, <a href="/search/cs?searchtype=author&amp;query=Ichnowski%2C+J">Jeffrey Ichnowski</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+L">Liu Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Kubiatowicz%2C+J">John Kubiatowicz</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Goldberg%2C+K">Ken Goldberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05408v1-abstract-short" style="display: inline;"> Cloud robotics enables robots to offload complex computational tasks to cloud servers for performance and ease of management. However, cloud compute can be costly, cloud services can suffer occasional downtime, and connectivity between the robot and cloud can be prone to variations in network Quality-of-Service (QoS). We present FogROS2-FT (Fault Tolerant) to mitigate these issues by introducing a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05408v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05408v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05408v1-abstract-full" style="display: none;"> Cloud robotics enables robots to offload complex computational tasks to cloud servers for performance and ease of management. 
However, cloud compute can be costly, cloud services can suffer occasional downtime, and connectivity between the robot and cloud can be prone to variations in network Quality-of-Service (QoS). We present FogROS2-FT (Fault Tolerant) to mitigate these issues by introducing a multi-cloud extension that automatically replicates independent stateless robotic services, routes requests to these replicas, and directs the first response back. With replication, robots can still benefit from cloud computation even when a cloud service provider is down or QoS is low. Additionally, many cloud computing providers offer low-cost spot computing instances that may shut down unpredictably. Normally, these low-cost instances would be inappropriate for cloud robotics, but the fault-tolerant nature of FogROS2-FT allows them to be used reliably. We demonstrate FogROS2-FT's fault tolerance capabilities in three cloud-robotics scenarios in simulation (visual object detection, semantic segmentation, motion planning) and one physical robot experiment (scan-pick-and-place). Running on the same hardware specification, FogROS2-FT achieves motion planning with up to 2.2x cost reduction and up to a 5.53x reduction in 99th-percentile (P99) long-tail latency. FogROS2-FT reduces the P99 long-tail latency of object detection and semantic segmentation by 2.0x and 2.1x, respectively, under network slowdown and resource contention.
Submitted 6 December, 2024; originally announced December 2024.
Comments: IEEE/RSJ International Conference on Intelligent Robots and Systems 2024 Best Paper Finalist
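The "replicate the request and take the first response" pattern that the abstract describes is easy to show in plain asyncio; this is a minimal sketch of the pattern, not the actual FogROS2-FT/ROS 2 code, and the replica names and failure rates are simulated.

```python
# First-response-wins replication across stateless service replicas: slow or
# failed replicas are masked as long as at least one replica answers.
import asyncio, random

async def call_replica(name: str, request: str) -> str:
    await asyncio.sleep(random.uniform(0.05, 0.5))       # simulated network + compute time
    if random.random() < 0.2:                             # simulated replica failure
        raise RuntimeError(f"{name} unavailable")
    return f"{name} handled {request!r}"

async def first_response(request: str, replicas: list) -> str:
    tasks = [asyncio.create_task(call_replica(r, request)) for r in replicas]
    try:
        for fut in asyncio.as_completed(tasks):
            try:
                return await fut                           # earliest successful reply wins
            except RuntimeError:
                continue                                   # ignore failed replicas
        raise RuntimeError("all replicas failed")
    finally:
        for t in tasks:
            t.cancel()                                     # stop the stragglers

print(asyncio.run(first_response("plan_motion",
                                 ["aws-replica", "gcp-replica", "spot-replica"])))
```

Because each service call is stateless, duplicating it across clouds changes cost and latency but not correctness, which is what makes cheap, preemptible spot instances usable here.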
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE/RSJ International Conference on Intelligent Robots and Systems 2024 Best Paper Finalist</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05299">arXiv:2412.05299</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05299">pdf</a>, <a href="https://arxiv.org/format/2412.05299">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Specifications: The missing link to making the development of LLM systems an engineering discipline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zaharia%2C+M">Matei Zaharia</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J">Joseph Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Goldberg%2C+K">Ken Goldberg</a>, <a href="/search/cs?searchtype=author&amp;query=Sen%2C+K">Koushik Sen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Angelopoulos%2C+A">Anastasios Angelopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Patil%2C+S+G">Shishir G. Patil</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lingjiao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Davis%2C+J+Q">Jared Q. Davis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05299v2-abstract-short" style="display: inline;"> Despite the significant strides made by generative AI in just a few short years, its future progress is constrained by the challenge of building modular and robust systems. This capability has been a cornerstone of past technological revolutions, which relied on combining components to create increasingly sophisticated and reliable systems. Cars, airplanes, computers, and software consist of compo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05299v2-abstract-full').style.display = 'inline'; document.getElementById('2412.05299v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05299v2-abstract-full" style="display: none;"> Despite the significant strides made by generative AI in just a few short years, its future progress is constrained by the challenge of building modular and robust systems. This capability has been a cornerstone of past technological revolutions, which relied on combining components to create increasingly sophisticated and reliable systems. Cars, airplanes, computers, and software consist of components-such as engines, wheels, CPUs, and libraries-that can be assembled, debugged, and replaced. 
A key tool for building such reliable and modular systems is specification: the precise description of the expected behavior, inputs, and outputs of each component. However, the generality of LLMs and the inherent ambiguity of natural language make defining specifications for LLM-based components (e.g., agents) both a challenging and urgent problem. In this paper, we discuss the progress the field has made so far, through advances like structured outputs, process supervision, and test-time compute, and outline several future directions for research to enable the development of modular and reliable LLM-based systems through improved specifications.
Submitted 16 December, 2024; v1 submitted 25 November, 2024; originally announced December 2024.

arXiv:2411.16102 (https://arxiv.org/abs/2411.16102) [pdf, other]  cs.LG
BlendServe: Optimizing Offline Inference for Auto-regressive Large Models with Resource-aware Batching
Authors: Yilong Zhao, Shuo Yang, Kan Zhu, Lianmin Zheng, Baris Kasikci, Yang Zhou, Jiarong Xing, Ion Stoica
Abstract: Offline batch inference, which leverages the flexibility of request batching to achieve higher throughput and lower costs, is becoming more popular for latency-insensitive applications. Meanwhile, recent progress in model capability and modality makes requests more diverse in compute and memory demands, creating unique opportunities for throughput improvement by resource overlapping. However, a request schedule that maximizes resource overlapping can conflict with the schedule that maximizes prefix sharing, a widely used performance optimization, causing sub-optimal inference throughput. We present BlendServe, a system that maximizes resource utilization of offline batch inference by combining the benefits of resource overlapping and prefix sharing using a resource-aware prefix tree. BlendServe exploits the relaxed latency requirements in offline batch inference to reorder and overlap requests with varied resource demands while ensuring high prefix sharing. We evaluate BlendServe on a variety of synthetic multi-modal workloads and show that it provides up to 1.44x throughput boost compared to widely used industry standards, vLLM and SGLang.
Submitted 25 November, 2024; originally announced November 2024.
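A much-simplified sketch of the tension BlendServe targets is below: requests are grouped by shared prefix to preserve cache reuse, and groups are then interleaved by their dominant resource so compute-heavy and memory-heavy work can overlap. This is my illustration under those assumptions, not BlendServe's actual resource-aware prefix-tree algorithm.

```python
# Group by prefix (locality), then alternate compute-heavy and memory-heavy
# groups in the schedule so different resources are busy at the same time.
from collections import defaultdict
from dataclasses import dataclass
from typing import List

@dataclass
class Req:
    prefix_id: str        # identifier of the shared prompt prefix
    kind: str             # "compute" (e.g., heavy prefill) or "memory" (e.g., long decode)

def schedule(requests: List[Req]) -> List[Req]:
    groups = defaultdict(list)
    for r in requests:
        groups[r.prefix_id].append(r)            # keep prefix sharing within a group
    compute_heavy, memory_heavy = [], []
    for g in groups.values():
        dominant = max(("compute", "memory"),
                       key=lambda k: sum(r.kind == k for r in g))
        (compute_heavy if dominant == "compute" else memory_heavy).append(g)
    ordered: List[Req] = []
    while compute_heavy or memory_heavy:         # alternate kinds to overlap resources
        if compute_heavy:
            ordered.extend(compute_heavy.pop())
        if memory_heavy:
            ordered.extend(memory_heavy.pop())
    return ordered
```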

arXiv:2411.11217 (https://arxiv.org/abs/2411.11217) [pdf, other]  cs.DC cs.AI cs.LG
MoE-Lightning: High-Throughput MoE Inference on Memory-constrained GPUs
Authors: Shiyi Cao, Shu Liu, Tyler Griggs, Peter Schafhalter, Xiaoxuan Liu, Ying Sheng, Joseph E. Gonzalez, Matei Zaharia, Ion Stoica
Abstract: Efficient deployment of large language models, particularly Mixture of Experts (MoE) models, on resource-constrained platforms presents significant challenges, especially in terms of computational efficiency and memory utilization. The MoE architecture, renowned for its ability to increase model capacity without a proportional increase in inference cost, greatly reduces token generation latency compared with dense models. However, the large model size makes MoE models inaccessible to individuals without high-end GPUs. In this paper, we propose MoE-Lightning, a high-throughput MoE batch inference system that significantly outperforms past work. MoE-Lightning introduces a novel CPU-GPU-I/O pipelining schedule, CGOPipe, with paged weights to achieve high resource utilization, and a performance model, HRM, based on a Hierarchical Roofline Model we introduce to help find policies with higher throughput than existing systems. MoE-Lightning can achieve up to 10.3x higher throughput than state-of-the-art offloading-enabled LLM inference systems for Mixtral 8x7B on a single T4 GPU (16GB). When the theoretical system throughput is bounded by the GPU memory, MoE-Lightning can reach the throughput upper bound with 2-3x less CPU memory, significantly increasing resource utilization. MoE-Lightning also supports efficient batch inference for much larger MoEs (e.g., Mixtral 8x22B and DBRX) on multiple low-cost GPUs (e.g., 2-4 T4s).
Submitted 17 November, 2024; originally announced November 2024.
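As a back-of-the-envelope illustration of roofline-style reasoning (not the paper's HRM, and with made-up numbers), attainable throughput is capped by the tightest of the compute roof and the memory/interconnect roofs the data must cross when expert weights are offloaded.

```python
# Attainable tokens/s under a simple multi-roof model: compute, HBM bandwidth,
# and CPU->GPU interconnect bandwidth each impose an upper bound.
def attainable_tokens_per_s(flops_per_token: float,
                            bytes_from_hbm_per_token: float,
                            bytes_over_pcie_per_token: float,
                            peak_flops: float,
                            hbm_bw: float,
                            pcie_bw: float) -> float:
    compute_roof = peak_flops / flops_per_token
    hbm_roof = hbm_bw / bytes_from_hbm_per_token
    pcie_roof = pcie_bw / bytes_over_pcie_per_token   # offloaded expert weights cross PCIe
    return min(compute_roof, hbm_roof, pcie_roof)

# Purely illustrative inputs (not measurements of any real GPU or model):
print(attainable_tokens_per_s(
    flops_per_token=2 * 13e9,            # ~2 FLOPs per active parameter (assumed)
    bytes_from_hbm_per_token=6.5e9,
    bytes_over_pcie_per_token=3e9,
    peak_flops=65e12, hbm_bw=300e9, pcie_bw=16e9))
```

The point of such a model is to expose which roof binds first, so a pipelining schedule can be chosen to keep the non-binding resources busy.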

arXiv:2411.09317 (https://arxiv.org/abs/2411.09317) [pdf, other]  cs.LG cs.DC
Pie: Pooling CPU Memory for LLM Inference
Authors: Yi Xu, Ziming Mao, Xiangxi Mo, Shu Liu, Ion Stoica
Abstract: The rapid growth of LLMs has revolutionized natural language processing and AI analysis, but their increasing size and memory demands present significant challenges. A common solution is to spill over to CPU memory; however, traditional GPU-CPU memory swapping often results in higher latency and lower throughput. This paper introduces Pie, an LLM inference framework that addresses these challenges with performance-transparent swapping and adaptive expansion. By leveraging predictable memory access patterns and the high bandwidth of modern hardware like the NVIDIA GH200 Grace Hopper Superchip, Pie enables concurrent data swapping without affecting foreground computation, expanding effective memory without added latency. Adaptive expansion dynamically adjusts CPU memory allocation based on real-time information, optimizing memory usage and performance under varying conditions. Pie maintains low computation latency, high throughput, and high elasticity. Our experimental evaluation demonstrates that Pie achieves an optimal swapping policy during cache warmup and effectively balances increased memory capacity with negligible impact on computation. With its extended capacity, Pie outperforms vLLM by up to 1.9x in throughput and 2x in latency. Additionally, Pie can reduce GPU memory usage by up to 1.67x while maintaining the same performance.
Compared to FlexGen, an offline profiling-based swapping solution, Pie achieves orders of magnitude lower latency and 9.4X higher throughput. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01438">arXiv:2411.01438</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01438">pdf</a>, <a href="https://arxiv.org/format/2411.01438">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SkyServe: Serving AI Models across Regions and Clouds with Spot Instances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Z">Ziming Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+T">Tian Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhanghao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Griggs%2C+T">Tyler Griggs</a>, <a href="/search/cs?searchtype=author&amp;query=Bhardwaj%2C+R">Romil Bhardwaj</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zongheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shenker%2C+S">Scott Shenker</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.01438v1-abstract-full" style="display: inline;"> Recent years have witnessed an explosive growth of AI models. The high cost of hosting AI services on GPUs and their demanding service requirements make it timely and challenging to lower service costs and guarantee service quality. While spot instances have long been offered with a large discount, spot preemptions have discouraged users from using them to host model replicas when serving AI models.
To address this, we introduce SkyServe, a system that efficiently serves AI models over a mixture of spot and on-demand replicas across regions and clouds. SkyServe intelligently spreads spot replicas across different failure domains (e.g., regions or clouds) to improve availability and reduce correlated preemptions, overprovisions cheap spot replicas beyond what is required as a safeguard against possible preemptions, and dynamically falls back to on-demand replicas when spot replicas become unavailable. We compare SkyServe with both research and production systems on real AI workloads: SkyServe reduces cost by up to 44% while achieving high resource availability compared to using on-demand replicas. Additionally, SkyServe improves P50, P90, and P99 latency by up to 2.6x, 3.1x, and 2.7x compared to other research and production systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01142">arXiv:2411.01142</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01142">pdf</a>, <a href="https://arxiv.org/format/2411.01142">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NEO: Saving GPU Memory Crisis with CPU Offloading for Online LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xuanlin Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Shiyi Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+M">Minlan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2411.01142v1-abstract-full" style="display: inline;"> Online LLM inference powers many exciting applications such as intelligent chatbots and autonomous agents. Modern LLM inference engines widely rely on request batching to improve inference throughput, aiming to make it cost-efficient when running on expensive GPU accelerators. However, the limited GPU memory has largely constrained the batch size achieved in practice, leaving significant GPU compute resources wasted. We present NEO, an online LLM inference system that offloads part of attention compute and KV cache states from the GPU to the local host CPU, effectively increasing the GPU batch size and thus inference throughput. To this end, NEO proposes asymmetric GPU-CPU pipelining and load-aware scheduling to balance GPU and CPU loads and fully utilize their compute and memory resources. We evaluate NEO on a wide range of workloads (i.e., code generation, text summarization), GPUs (i.e., T4, A10G, H100), and LLM models (i.e., 7B, 8B, 70B). NEO achieves up to 7.5$\times$, 26%, and 14% higher throughput compared to the GPU-only approach on T4, A10G, and H100 GPUs, respectively, while maintaining the same latency; with more powerful CPUs, NEO achieves up to 79.3% throughput gain on A10G GPU. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16227">arXiv:2410.16227</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16227">pdf</a>, <a href="https://arxiv.org/format/2410.16227">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Managing Bandwidth: The Key to Cloud-Assisted Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Krentsel%2C+A">Alexander Krentsel</a>, <a href="/search/cs?searchtype=author&amp;query=Schafhalter%2C+P">Peter Schafhalter</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E.
Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Ratnasamy%2C+S">Sylvia Ratnasamy</a>, <a href="/search/cs?searchtype=author&amp;query=Shenker%2C+S">Scott Shenker</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.16227v1-abstract-full" style="display: inline;"> Prevailing wisdom asserts that one cannot rely on the cloud for critical real-time control systems like self-driving cars. We argue that we can, and must. Following the trends of increasing model sizes, improvements in hardware, and evolving mobile networks, we identify an opportunity to offload parts of time-sensitive and latency-critical compute to the cloud. Doing so requires carefully allocating bandwidth to meet strict latency SLOs, while maximizing benefit to the car. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
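<p class="is-size-7"> A toy sketch of the allocation problem the abstract poses: choose which offloadable tasks to send to the cloud so the transfer fits a latency SLO while maximizing benefit to the car. The greedy benefit-per-megabit heuristic, task names, and numbers are invented for illustration and are not the paper's method. </p>
<pre><code class="language-python">
# Toy bandwidth allocator: pick offloadable tasks that fit a latency SLO.
# Task fields and numbers are illustrative, not from the paper.

def plan_offload(tasks, uplink_mbps, slo_ms):
    """Greedy selection by benefit per transferred megabit."""
    budget = uplink_mbps * slo_ms / 1000.0              # megabits transferable within the SLO
    ranked = sorted(tasks, key=lambda t: t["benefit"] / t["megabits"], reverse=True)
    chosen, used = [], 0.0
    for task in ranked:
        if used + task["megabits"] > budget:
            continue                                    # would blow the latency budget
        chosen.append(task["name"])
        used += task["megabits"]
    return chosen, used

tasks = [
    {"name": "hd-camera-segmentation", "megabits": 24.0, "benefit": 9.0},
    {"name": "lidar-refinement",        "megabits": 40.0, "benefit": 6.0},
    {"name": "route-replanning",        "megabits": 2.0,  "benefit": 3.0},
]
print(plan_offload(tasks, uplink_mbps=400.0, slo_ms=100.0))
</code></pre>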
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14872">arXiv:2410.14872</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14872">pdf</a>, <a href="https://arxiv.org/format/2410.14872">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> How to Evaluate Reward Models for RLHF </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Frick%2C+E">Evan Frick</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianle Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Connor Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Angelopoulos%2C+A+N">Anastasios N. Angelopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Jiao%2C+J">Jiantao Jiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+B">Banghua Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14872v2-abstract-short" style="display: inline;"> We introduce a new benchmark for reward models that quantifies their ability to produce strong language models through RLHF (Reinforcement Learning from Human Feedback). The gold-standard approach is to run a full RLHF training pipeline and directly probe downstream LLM performance. However, this process is prohibitively expensive. To address this, we build a predictive model of downstream LLM per&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14872v2-abstract-full').style.display = 'inline'; document.getElementById('2410.14872v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14872v2-abstract-full" style="display: none;"> We introduce a new benchmark for reward models that quantifies their ability to produce strong language models through RLHF (Reinforcement Learning from Human Feedback). The gold-standard approach is to run a full RLHF training pipeline and directly probe downstream LLM performance. However, this process is prohibitively expensive. To address this, we build a predictive model of downstream LLM performance by evaluating the reward model on proxy tasks. These proxy tasks consist of a large-scale human preference and a verifiable correctness preference dataset, in which we measure 12 metrics across 12 domains. 
To investigate which reward model metrics are most correlated with gold-standard RLHF outcomes, we launch an end-to-end RLHF experiment on a large-scale crowdsourced human preference platform to view real reward model downstream performance as ground truth. Ultimately, we compile our data and findings into Preference Proxy Evaluations (PPE), the first reward model benchmark explicitly linked to post-RLHF real-world human preference performance, which we open-source for public use and further development. Our code and evaluations can be found at https://github.com/lmarena/PPE . </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12784">arXiv:2410.12784</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12784">pdf</a>, <a href="https://arxiv.org/format/2410.12784">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> JudgeBench: A Benchmark for Evaluating LLM-based Judges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+S">Sijun Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+S">Siyuan Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Montgomery%2C+K">Kyle Montgomery</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+W+Y">William Y. Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Cuadron%2C+A">Alejandro Cuadron</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+R+A">Raluca Ada Popa</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2410.12784v1-abstract-full" style="display: inline;"> LLM-based judges have emerged as a scalable alternative to human evaluation and are increasingly used to assess, compare, and improve models. However, the reliability of LLM-based judges themselves is rarely scrutinized. As LLMs become more advanced, their responses grow more sophisticated, requiring stronger judges to evaluate them. Existing benchmarks primarily focus on a judge&#39;s alignment with human preferences, but often fail to account for more challenging tasks where crowdsourced human preference is a poor indicator of factual and logical correctness. To address this, we propose a novel evaluation framework to objectively evaluate LLM-based judges. Based on this framework, we propose JudgeBench, a benchmark for evaluating LLM-based judges on challenging response pairs spanning knowledge, reasoning, math, and coding. JudgeBench leverages a novel pipeline for converting existing difficult datasets into challenging response pairs with preference labels reflecting objective correctness. Our comprehensive evaluation on a collection of prompted judges, fine-tuned judges, multi-agent judges, and reward models shows that JudgeBench poses a significantly greater challenge than previous benchmarks, with many strong models (e.g., GPT-4o) performing just slightly better than random guessing. Overall, JudgeBench offers a reliable platform for assessing increasingly advanced LLM-based judges. Data and code are available at https://github.com/ScalerLab/JudgeBench . </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
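<p class="is-size-7"> A minimal sketch of scoring a judge on response pairs with objective correctness labels, in the spirit of the benchmark described above: the judge sees one correct and one incorrect answer (in random order, to control for position bias) and is scored on how often it prefers the correct one. The judge_prefers_first callable, the pairs, and the baseline are placeholders, not JudgeBench's actual interface or data. </p>
<pre><code class="language-python">
# Toy evaluation loop over "challenging response pairs":
# the judge is shown a correct and an incorrect answer and must pick one.
# `judge_prefers_first` stands in for a real LLM call.
import random

def accuracy(pairs, judge_prefers_first):
    correct = 0
    for question, good, bad in pairs:
        # Randomize order so position bias cannot inflate the score.
        flip = random.random() > 0.5
        first, second = (bad, good) if flip else (good, bad)
        picked_first = judge_prefers_first(question, first, second)
        picked_good = (picked_first and not flip) or (flip and not picked_first)
        correct += int(picked_good)
    return correct / len(pairs)

# A deliberately weak judge that always picks the first answer scores about 0.5,
# i.e., no better than random guessing on objectively labeled pairs.
pairs = [("2+2?", "4", "5")] * 1000
print(accuracy(pairs, judge_prefers_first=lambda q, a, b: True))
</code></pre>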
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15792">arXiv:2408.15792</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.15792">pdf</a>, <a href="https://arxiv.org/format/2408.15792">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient LLM Scheduling by Learning to Rank </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yichao Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Siqi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+R">Runlong Su</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+A">Aurick Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15792v1-abstract-short" style="display: inline;"> In Large Language Model (LLM) inference, the output length of an LLM request is typically regarded as not known a priori. Consequently, most LLM serving systems employ a simple First-come-first-serve (FCFS) scheduling strategy, leading to Head-Of-Line (HOL) blocking and reduced throughput and service quality. In this paper, we reexamine this assumption -- we show that, although predicting the exac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15792v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15792v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15792v1-abstract-full" style="display: none;"> In Large Language Model (LLM) inference, the output length of an LLM request is typically regarded as not known a priori. Consequently, most LLM serving systems employ a simple First-come-first-serve (FCFS) scheduling strategy, leading to Head-Of-Line (HOL) blocking and reduced throughput and service quality. In this paper, we reexamine this assumption -- we show that, although predicting the exact generation length of each request is infeasible, it is possible to predict the relative ranks of output lengths in a batch of requests, using learning to rank. The ranking information offers valuable guidance for scheduling requests. Building on this insight, we develop a novel scheduler for LLM inference and serving that can approximate the shortest-job-first (SJF) schedule better than existing approaches. We integrate this scheduler with the state-of-the-art LLM serving system and show significant performance improvement in several important applications: 2.8x lower latency in chatbot serving and 6.5x higher throughput in synthetic data generation. 
Our code is available at https://github.com/hao-ai-lab/vllm-ltr.git </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07092">arXiv:2408.07092</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.07092">pdf</a>, <a href="https://arxiv.org/format/2408.07092">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Post-Training Sparse Attention with Double Sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+Y">Ying Sheng</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L">Lianmin Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.07092v2-abstract-full" style="display: inline;"> The inference process for large language models is slow and memory-intensive, with one of the most critical bottlenecks being excessive Key-Value (KV) cache accesses. This paper introduces &#34;Double Sparsity,&#34; a novel post-training sparse attention technique designed to alleviate this bottleneck by reducing KV cache access. Double Sparsity combines token sparsity, which focuses on utilizing only the important tokens for computing self-attention, with channel sparsity, an approach that uses important feature channels for identifying important tokens. Our key insight is that the pattern of channel sparsity is relatively static, allowing us to use offline calibration to make it efficient at runtime, thereby enabling accurate and efficient identification of important tokens.
Moreover, this method can be combined with offloading to achieve significant memory usage reduction. Experimental results demonstrate that Double Sparsity can achieve $\frac{1}{16}$ token and channel sparsity with minimal impact on accuracy across various tasks, including wiki-2 perplexity, key-value retrieval, and long context benchmarks with models including Llama-2-7B, Llama-2-70B, and Mixtral-8x7B. It brings up to a 14.1$\times$ acceleration in attention operations and a 1.9$\times$ improvement in end-to-end inference on GPUs. With offloading, it achieves a decoding speed acceleration of 16.3$\times$ compared to state-of-the-art solutions at a sequence length of 256K. Our code is publicly available at https://github.com/andy-yang-1/DoubleSparse. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03561">arXiv:2408.03561</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03561">pdf</a>, <a href="https://arxiv.org/format/2408.03561">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MPC-Minimized Secure LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rathee%2C+D">Deevashwer Rathee</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dacheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+R">Raluca Popa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2408.03561v1-abstract-full" style="display: inline;"> Many inference services based on large language models (LLMs) pose a privacy concern, either revealing user prompts to the service or the proprietary weights to the user. Secure inference offers a solution to this problem through secure multi-party computation (MPC); however, it is still impractical for modern LLM workloads due to the large overhead imposed by MPC. To address this overhead, we propose Marill, a framework that adapts LLM fine-tuning to minimize MPC usage during secure inference. Marill introduces high-level architectural changes during fine-tuning that significantly reduce the number of expensive operations needed within MPC during inference, by removing some and relocating others outside MPC without compromising security. As a result, Marill-generated models are more efficient across all secure inference protocols and our approach complements MPC-friendly approximations for such operations. Compared to standard fine-tuning, Marill results in 3.6-11.3x better runtime and 2.4-6.9x better communication during secure inference across various MPC settings, while typically preserving over 90% performance across downstream tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
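<p class="is-size-7"> A back-of-the-envelope sketch of the "minimize what runs inside MPC" idea: if secure-inference cost is dominated by a few expensive operation types, removing some and relocating others outside MPC shrinks the bill. The operation mix and relative costs below are invented for illustration and do not reflect Marill's actual design. </p>
<pre><code class="language-python">
# Rough accounting of work that must run inside MPC, before and after restructuring.
# Operation counts and relative MPC costs are made up for illustration only.

MPC_COST = {"matmul": 1.0, "softmax": 25.0, "gelu": 12.0, "layernorm": 8.0}

def mpc_bill(op_counts_inside_mpc):
    return sum(MPC_COST[op] * n for op, n in op_counts_inside_mpc.items())

baseline = {"matmul": 96, "softmax": 32, "gelu": 32, "layernorm": 64}
# Hypothetical restructuring: some nonlinearities removed, others relocated
# outside MPC, so only a fraction of them remains inside the secure protocol.
restructured = {"matmul": 96, "softmax": 8, "gelu": 8, "layernorm": 16}

print("baseline MPC cost:    ", mpc_bill(baseline))
print("restructured MPC cost:", mpc_bill(restructured))
</code></pre>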
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16831">arXiv:2407.16831</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16831">pdf</a>, <a href="https://arxiv.org/ps/2407.16831">ps</a>, <a href="https://arxiv.org/format/2407.16831">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Networks of Networks: Complexity Class Principles Applied to Compound AI Systems Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Davis%2C+J+Q">Jared Quincy Davis</a>, <a href="/search/cs?searchtype=author&amp;query=Hanin%2C+B">Boris Hanin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lingjiao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Bailis%2C+P">Peter Bailis</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zaharia%2C+M">Matei Zaharia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.16831v1-abstract-full" style="display: inline;"> As practitioners seek to surpass the current reliability and quality frontier of monolithic models, Compound AI Systems consisting of many language model inference calls are increasingly employed. In this work, we construct systems, which we call Networks of Networks (NoNs), organized around the distinction between generating a proposed answer and verifying its correctness, a fundamental concept in complexity theory that we show empirically extends to Language Models (LMs). We introduce a verifier-based judge NoN with K generators, an instantiation of &#34;best-of-K&#34; or &#34;judge-based&#34; compound AI systems. Through experiments on synthetic tasks such as prime factorization, and core benchmarks such as the MMLU, we demonstrate notable performance gains. For instance, in factoring products of two 3-digit primes, a simple NoN improves accuracy from 3.7\% to 36.6\%. On MMLU, a verifier-based judge construction with only 3 generators boosts accuracy over individual GPT-4-Turbo calls by 2.8\%. Our analysis reveals that these gains are most pronounced in domains where verification is notably easier than generation--a characterization which we believe subsumes many reasoning and procedural knowledge tasks, but doesn&#39;t often hold for factual and declarative knowledge-based settings.
For mathematical and formal logic reasoning-based subjects of MMLU, we observe a 5-8\% or higher gain, whilst observing no gain on others such as geography and religion. We provide key takeaways for ML practitioners, including the importance of considering verification complexity, the impact of witness format on verifiability, and a simple test to determine the potential benefit of this NoN approach for a given problem distribution. This work aims to inform future research and practice in the design of compound AI systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18665">arXiv:2406.18665</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18665">pdf</a>, <a href="https://arxiv.org/format/2406.18665">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RouteLLM: Learning to Route LLMs with Preference Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ong%2C+I">Isaac Ong</a>, <a href="/search/cs?searchtype=author&amp;query=Almahairi%2C+A">Amjad Almahairi</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+V">Vincent Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Tianhao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Kadous%2C+M+W">M Waleed Kadous</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2406.18665v4-abstract-full" style="display: inline;"> Large language models (LLMs) exhibit impressive capabilities across a wide range of tasks, yet the choice of which model to use often involves a trade-off between performance and cost. More powerful models, though effective, come with higher expenses, while less capable models are more cost-effective. To address this dilemma, we propose several efficient router models that dynamically select between a stronger and a weaker LLM during inference, aiming to optimize the balance between cost and response quality. We develop a training framework for these routers leveraging human preference data and data augmentation techniques to enhance performance. Our evaluation on widely-recognized benchmarks shows that our approach significantly reduces costs (by over 2 times in certain cases) without compromising the quality of responses. Interestingly, our router models also demonstrate significant transfer learning capabilities, maintaining their performance even when the strong and weak models are changed at test time. This highlights the potential of these routers to provide a cost-effective yet high-performance solution for deploying LLMs. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
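<p class="is-size-7"> The routing idea described above can be illustrated with a minimal threshold router: a scorer estimates how likely the weaker model's answer is to be good enough, and only low-scoring prompts go to the stronger model. The scorer, model names, prices, and threshold below are placeholders, not RouteLLM's trained routers. </p>
<pre><code class="language-python">
# Minimal threshold router between a strong and a weak LLM.
# The score function, model names, and prices are illustrative placeholders.

STRONG, WEAK = "strong-model", "weak-model"
PRICE = {STRONG: 10.0, WEAK: 0.5}   # illustrative cost per request

def route(prompt, win_probability, threshold=0.7):
    """Send the prompt to the weak model when it is predicted to be good enough."""
    p_weak_ok = win_probability(prompt)      # e.g., a classifier trained on preference data
    return WEAK if p_weak_ok >= threshold else STRONG

def total_cost(prompts, win_probability, threshold=0.7):
    return sum(PRICE[route(p, win_probability, threshold)] for p in prompts)

# Toy scorer: longer prompts are assumed to be harder for the weak model.
scorer = lambda prompt: 1.0 / (1.0 + len(prompt.split()) / 50.0)
prompts = ["short question"] * 80 + ["a much longer, multi-part analytical request " * 10] * 20
print(total_cost(prompts, scorer), "vs all-strong:", PRICE[STRONG] * len(prompts))
</code></pre>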
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14066">arXiv:2406.14066</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14066">pdf</a>, <a href="https://arxiv.org/format/2406.14066">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Speculative Decoding for Serving Large Language Models Using Goodput </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaoxuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Daniel%2C+C">Cade Daniel</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Langxiang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Kwon%2C+W">Woosuk Kwon</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhuohan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Mo%2C+X">Xiangxi Mo</a>, <a href="/search/cs?searchtype=author&amp;query=Cheung%2C+A">Alvin Cheung</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Z">Zhijie Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2406.14066v2-abstract-full" style="display: inline;"> Reducing the inference latency of large language models (LLMs) is crucial, and speculative decoding (SD) stands out as one of the most effective techniques. Rather than letting the LLM generate all tokens directly, speculative decoding employs effective proxies to predict potential outputs, which are then verified by the LLM without compromising the generation quality. Yet, deploying SD in real online LLM serving systems (with continuous batching) does not always yield improvement -- under higher request rates or low speculation accuracy, it paradoxically increases latency. Furthermore, there is no single best speculation length that works for all workloads under different system loads. Based on these observations, we develop a dynamic framework, SmartSpec. SmartSpec dynamically determines the best speculation length for each request (from 0, i.e., no speculation, to many tokens) -- hence the associated speculative execution costs -- based on a new metric called goodput, which characterizes the current observed load of the entire system and the speculation accuracy.
We show that SmartSpec consistently reduces average request latency by up to 3.2x compared to non-speculative decoding baselines across different sizes of target models, draft models, request rates, and datasets. Moreover, SmartSpec can be applied to different styles of speculative decoding, including traditional, model-based approaches as well as model-free methods like prompt lookup and tree-style decoding. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11939">arXiv:2406.11939</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11939">pdf</a>, <a href="https://arxiv.org/format/2406.11939">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianle Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Frick%2C+E">Evan Frick</a>, <a href="/search/cs?searchtype=author&amp;query=Dunlap%2C+L">Lisa Dunlap</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Tianhao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+B">Banghua Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2406.11939v2-abstract-full" style="display: inline;"> The rapid evolution of Large Language Models (LLMs) has outpaced the development of model evaluation, highlighting the need for continuous curation of new, challenging benchmarks. However, manual curation of high-quality, human-aligned benchmarks is expensive and time-consuming. To address this, we introduce BenchBuilder, an automated pipeline that leverages LLMs to curate high-quality, open-ended prompts from large, crowd-sourced datasets, enabling continuous benchmark updates without a human in the loop. We apply BenchBuilder to datasets such as Chatbot Arena and WildChat-1M, extracting challenging prompts and utilizing LLM-as-a-Judge for automatic model evaluation. To validate benchmark quality, we propose new metrics to measure a benchmark&#39;s alignment with human preferences and ability to separate models. We release Arena-Hard-Auto, a benchmark consisting of 500 challenging prompts curated by BenchBuilder. Arena-Hard-Auto provides 3x higher separation of model performances compared to MT-Bench and achieves 98.6% correlation with human preference rankings, all at a cost of $20. Our work sets a new framework for the scalable curation of automated benchmarks from extensive data. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
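<p class="is-size-7"> Toy versions of the two benchmark-quality checks mentioned above, agreement with a human-preference ranking and separation between models: the rank-correlation formula is standard Spearman, while the separation margin, model names, and scores are fabricated for illustration and are not the paper's metrics. </p>
<pre><code class="language-python">
# Toy benchmark-quality checks: rank agreement with humans and model separation.
# Model names, ranks, scores, and the noise margin are fabricated.

def spearman(rank_a, rank_b):
    n = len(rank_a)
    d2 = sum((rank_a[m] - rank_b[m]) ** 2 for m in rank_a)
    return 1.0 - 6.0 * d2 / (n * (n * n - 1))

def separation(scores, margin=2.0):
    """Fraction of model pairs whose score gap exceeds a (made-up) noise margin."""
    names = list(scores)
    pairs = [(a, b) for i, a in enumerate(names) for b in names[i + 1:]]
    clear = [1 for a, b in pairs if abs(scores[a] - scores[b]) > margin]
    return sum(clear) / len(pairs)

human_rank = {"model-a": 1, "model-b": 2, "model-c": 3, "model-d": 4}
bench_rank = {"model-a": 1, "model-b": 3, "model-c": 2, "model-d": 4}
bench_score = {"model-a": 82.0, "model-b": 71.5, "model-c": 70.9, "model-d": 55.0}

print("agreement:", spearman(human_rank, bench_rank))
print("separation:", separation(bench_score))
</code></pre>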
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20947">arXiv:2405.20947</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.20947">pdf</a>, <a href="https://arxiv.org/format/2405.20947">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OR-Bench: An Over-Refusal Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cui%2C+J">Justin Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Hsieh%2C+C">Cho-Jui Hsieh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2405.20947v2-abstract-full" style="display: inline;"> Large Language Models (LLMs) require careful safety alignment to prevent malicious outputs. While significant research focuses on mitigating harmful content generation, the enhanced safety often comes with the side effect of over-refusal, where LLMs may reject innocuous prompts and become less helpful. Although the issue of over-refusal has been empirically observed, a systematic measurement is challenging due to the difficulty of crafting prompts that appear harmful but are benign. This study proposes a novel method for automatically generating large-scale sets of &#34;seemingly toxic prompts&#34; (benign prompts likely rejected by LLMs). Leveraging this technique, we introduce OR-Bench, the first large-scale over-refusal benchmark. OR-Bench comprises 80,000 seemingly toxic prompts across 10 common rejection categories, a subset of around 1,000 hard prompts that are challenging even for state-of-the-art LLMs, and an additional 600 toxic prompts to prevent indiscriminate responses. We then conduct a comprehensive study to measure the over-refusal of 25 popular LLMs across 8 model families. Our datasets are available at https://huggingface.co/datasets/bench-llm/or-bench and the demo can be found at https://huggingface.co/spaces/bench-llm/or-bench. We hope this benchmark can help the community develop better safety-aligned models.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20947v2-abstract-full').style.display = 'none'; document.getElementById('2405.20947v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">version 2, 10 pages main, 22 pages total</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16714">arXiv:2405.16714</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.16714">pdf</a>, <a href="https://arxiv.org/format/2405.16714">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Crafting Interpretable Embeddings by Asking LLMs Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Benara%2C+V">Vinamra Benara</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+C">Chandan Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Morris%2C+J+X">John X. Morris</a>, <a href="/search/cs?searchtype=author&amp;query=Antonello%2C+R">Richard Antonello</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Huth%2C+A+G">Alexander G. Huth</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16714v1-abstract-short" style="display: inline;"> Large language models (LLMs) have rapidly improved text embeddings for a growing array of natural-language processing tasks. However, their opaqueness and proliferation into scientific domains such as neuroscience have created a growing need for interpretability. Here, we ask whether we can obtain interpretable embeddings through LLM prompting. We introduce question-answering embeddings (QA-Emb),&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16714v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16714v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16714v1-abstract-full" style="display: none;"> Large language models (LLMs) have rapidly improved text embeddings for a growing array of natural-language processing tasks. However, their opaqueness and proliferation into scientific domains such as neuroscience have created a growing need for interpretability. 
Here, we ask whether we can obtain interpretable embeddings through LLM prompting. We introduce question-answering embeddings (QA-Emb), embeddings where each feature represents an answer to a yes/no question asked to an LLM. Training QA-Emb reduces to selecting a set of underlying questions rather than learning model weights. We use QA-Emb to flexibly generate interpretable models for predicting fMRI voxel responses to language stimuli. QA-Emb significantly outperforms an established interpretable baseline, and does so while requiring very few questions. This paves the way towards building flexible feature spaces that can concretize and evaluate our understanding of semantic brain representations. We additionally find that QA-Emb can be effectively approximated with an efficient model, and we explore broader applications in simple NLP tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16714v1-abstract-full').style.display = 'none'; document.getElementById('2405.16714v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.18928">arXiv:2404.18928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.18928">pdf</a>, <a href="https://arxiv.org/format/2404.18928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Stylus: Automatic Adapter Selection for Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+M">Michael Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+J">Justin Wong</a>, <a href="/search/cs?searchtype=author&amp;query=Trabucco%2C+B">Brandon Trabucco</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yanping Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhifeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Salakhutdinov%2C+R">Ruslan Salakhutdinov</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.18928v1-abstract-short" style="display: inline;"> Beyond scaling base models with more data or parameters, fine-tuned adapters provide an alternative way to generate high fidelity, custom images at reduced costs. 
As such, adapters have been widely adopted by open-source communities, accumulating a database of over 100K adapters, most of which are highly customized with insufficient descriptions. This paper explores the problem of matching the prom&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18928v1-abstract-full').style.display = 'inline'; document.getElementById('2404.18928v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.18928v1-abstract-full" style="display: none;"> Beyond scaling base models with more data or parameters, fine-tuned adapters provide an alternative way to generate high fidelity, custom images at reduced costs. As such, adapters have been widely adopted by open-source communities, accumulating a database of over 100K adapters, most of which are highly customized with insufficient descriptions. This paper explores the problem of matching the prompt to a set of relevant adapters, built on recent work that highlights the performance gains of composing adapters. We introduce Stylus, which efficiently selects and automatically composes task-specific adapters based on a prompt&#39;s keywords. Stylus outlines a three-stage approach that first summarizes adapters with improved descriptions and embeddings, retrieves relevant adapters, and then further assembles adapters based on prompts&#39; keywords by checking how well they fit the prompt. To evaluate Stylus, we developed StylusDocs, a curated dataset featuring 75K adapters with pre-computed adapter embeddings. In our evaluation on popular Stable Diffusion checkpoints, Stylus achieves greater CLIP-FID Pareto efficiency and is twice as preferred, with humans and multimodal models as evaluators, over the base model. See stylus-diffusion.github.io for more. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18928v1-abstract-full').style.display = 'none'; document.getElementById('2404.18928v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Website: https://stylus-diffusion.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14527">arXiv:2404.14527</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.14527">pdf</a>, <a href="https://arxiv.org/format/2404.14527">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mélange: Cost Efficient Large Language Model Serving by Exploiting GPU Heterogeneity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Griggs%2C+T">Tyler Griggs</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaoxuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jiaxiang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+D">Doyoung Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheung%2C+A">Alvin Cheung</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14527v4-abstract-short" style="display: inline;"> Large language models (LLMs) are increasingly integrated into many online services, yet they remain cost-prohibitive to deploy due to the requirement of expensive GPU instances. Prior work has addressed the high cost of LLM serving by improving the inference engine, but less attention has been given to selecting the most cost-efficient GPU type(s) for a specific LLM service. There is a large and g&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14527v4-abstract-full').style.display = 'inline'; document.getElementById('2404.14527v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14527v4-abstract-full" style="display: none;"> Large language models (LLMs) are increasingly integrated into many online services, yet they remain cost-prohibitive to deploy due to the requirement of expensive GPU instances. Prior work has addressed the high cost of LLM serving by improving the inference engine, but less attention has been given to selecting the most cost-efficient GPU type(s) for a specific LLM service. There is a large and growing landscape of GPU types and, within these options, higher cost does not always lead to increased performance. Instead, through a comprehensive investigation, we find that three key LLM service characteristics (request size, request rate, SLO) strongly influence GPU cost efficiency, and differing GPU types are most cost efficient for differing LLM service settings. As a result, the most cost-efficient allocation for a given service is typically a mix of heterogeneous GPU types. 
Based on this analysis, we introduce Mélange, a GPU allocation framework that navigates these diverse LLM service characteristics and heterogeneous GPU option space to automatically and efficiently derive the minimal-cost GPU allocation for a given LLM service. We formulate the GPU allocation task as a cost-aware bin packing problem where GPUs are bins and items are slices of the service workload. Our formulation&#39;s constraints account for a service&#39;s unique characteristics, allowing Mélange to be flexible to support diverse service settings and heterogeneity-aware to adapt the GPU allocation to a specific service. Compared to using only a single GPU type, Mélange reduces deployment costs by up to 77% in conversational settings, 33% in document-based settings, and 51% in a mixed setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14527v4-abstract-full').style.display = 'none'; document.getElementById('2404.14527v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06921">arXiv:2404.06921</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.06921">pdf</a>, <a href="https://arxiv.org/format/2404.06921">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GoEX: Perspectives and Designs Towards a Runtime for Autonomous LLM Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Patil%2C+S+G">Shishir G. Patil</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tianjun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+V">Vivian Fang</a>, <a href="/search/cs?searchtype=author&amp;query=C.%2C+N">Noppapon C.</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Roy Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+A">Aaron Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Casado%2C+M">Martin Casado</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+R+A">Raluca Ada Popa</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06921v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are evolving beyond their classical role of providing information within dialogue systems to actively engaging with tools and performing actions on real-world applications and services. Today, humans verify the correctness and appropriateness of the LLM-generated outputs (e.g., code, functions, or actions) before putting them into real-world execution. 
This poses signi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06921v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06921v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06921v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are evolving beyond their classical role of providing information within dialogue systems to actively engaging with tools and performing actions on real-world applications and services. Today, humans verify the correctness and appropriateness of the LLM-generated outputs (e.g., code, functions, or actions) before putting them into real-world execution. This poses significant challenges as code comprehension is well known to be notoriously difficult. In this paper, we study how humans can efficiently collaborate with, delegate to, and supervise autonomous LLMs in the future. We argue that in many cases, &#34;post-facto validation&#34; - verifying the correctness of a proposed action after seeing the output - is much easier than the aforementioned &#34;pre-facto validation&#34; setting. The core concept behind enabling a post-facto validation system is the integration of an intuitive undo feature, and establishing a damage confinement for the LLM-generated actions as effective strategies to mitigate the associated risks. Using this, a human can now either revert the effect of an LLM-generated output or be confident that the potential risk is bounded. We believe this is critical to unlock the potential for LLM agents to interact with applications and services with limited (post-facto) human involvement. We describe the design and implementation of our open-source runtime for executing LLM actions, Gorilla Execution Engine (GoEX), and present open research questions towards realizing the goal of LLMs and applications interacting with each other with minimal human supervision. We release GoEX at https://github.com/ShishirPatil/gorilla/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06921v1-abstract-full').style.display = 'none'; document.getElementById('2404.06921v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04500">arXiv:2404.04500</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.04500">pdf</a>, <a href="https://arxiv.org/format/2404.04500">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Trustless Audits without Revealing Data or Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Waiwitlikhit%2C+S">Suppakit Waiwitlikhit</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Hashimoto%2C+T">Tatsunori Hashimoto</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+D">Daniel Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04500v1-abstract-short" style="display: inline;"> There is an increasing conflict between business incentives to hide models and data as trade secrets, and the societal need for algorithmic transparency. For example, a rightsholder wishing to know whether their copyrighted works have been used during training must convince the model provider to allow a third party to audit the model and data. Finding a mutually agreeable third party is difficult,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04500v1-abstract-full').style.display = 'inline'; document.getElementById('2404.04500v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04500v1-abstract-full" style="display: none;"> There is an increasing conflict between business incentives to hide models and data as trade secrets, and the societal need for algorithmic transparency. For example, a rightsholder wishing to know whether their copyrighted works have been used during training must convince the model provider to allow a third party to audit the model and data. Finding a mutually agreeable third party is difficult, and the associated costs often make this approach impractical. In this work, we show that it is possible to simultaneously allow model providers to keep their model weights (but not architecture) and data secret while allowing other parties to trustlessly audit model and data properties. We do this by designing a protocol called ZkAudit in which model providers publish cryptographic commitments of datasets and model weights, alongside a zero-knowledge proof (ZKP) certifying that published commitments are derived from training the model. Model providers can then respond to audit requests by privately computing any function F of the dataset (or model) and releasing the output of F alongside another ZKP certifying the correct execution of F. 
To enable ZkAudit, we develop new methods of computing ZKPs for SGD on modern neural nets for simple recommender systems and image classification models capable of high accuracies on ImageNet. Empirically, we show it is possible to provide trustless audits of DNNs, including copyright, censorship, and counterfactual audits with little to no loss in accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04500v1-abstract-full').style.display = 'none'; document.getElementById('2404.04500v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.02015">arXiv:2404.02015</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.02015">pdf</a>, <a href="https://arxiv.org/format/2404.02015">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> MuxServe: Flexible Spatial-Temporal Multiplexing for Multiple LLM Serving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+J">Jiangfei Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+R">Runyu Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Duanmu%2C+H">Haojie Duanmu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiuhong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xingcheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.02015v2-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable performance, and organizations are racing to serve LLMs of varying sizes as endpoints for use-cases like chat, programming and search. However, efficiently serving multiple LLMs poses significant challenges for existing approaches due to varying popularity of LLMs. In the paper, we present MuxServe, a flexible spatial-temporal multiplexing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02015v2-abstract-full').style.display = 'inline'; document.getElementById('2404.02015v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.02015v2-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable performance, and organizations are racing to serve LLMs of varying sizes as endpoints for use-cases like chat, programming and search. However, efficiently serving multiple LLMs poses significant challenges for existing approaches due to varying popularity of LLMs. In the paper, we present MuxServe, a flexible spatial-temporal multiplexing system for efficient multiple LLM serving. 
The key insight behind MuxServe is to colocate LLMs considering their popularity to multiplex memory resources, and leverage the characteristics of prefill and decoding phases to separate and flexibly colocate them to multiplex computation resources. MuxServe formally formulates the multiplexing problem, and proposes a novel placement algorithm and adaptive batch scheduling strategy to identify optimal colocations and maximize utilization. MuxServe designs a unified resource manager to enable flexible and efficient multiplexing. Evaluation results show that MuxServe can achieve up to $1.8\times$ higher throughput or process $2.9\times$ more requests within $99\%$ SLO attainment. The code is available at: \url{https://github.com/hao-ai-lab/MuxServe}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02015v2-abstract-full').style.display = 'none'; document.getElementById('2404.02015v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13839">arXiv:2403.13839</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.13839">pdf</a>, <a href="https://arxiv.org/format/2403.13839">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> depyf: Open the Opaque Box of PyTorch Compiler for Machine Learning Researchers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=You%2C+K">Kaichao You</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+R">Runsheng Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+M">Meng Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianmin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+M">Mingsheng Long</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13839v1-abstract-short" style="display: inline;"> PyTorch \texttt{2.x} introduces a compiler designed to accelerate deep learning programs. However, for machine learning researchers, adapting to the PyTorch compiler to full potential can be challenging. The compiler operates at the Python bytecode level, making it appear as an opaque box. 
To address this, we introduce \texttt{depyf}, a tool designed to demystify the inner workings of the PyTorch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13839v1-abstract-full').style.display = 'inline'; document.getElementById('2403.13839v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13839v1-abstract-full" style="display: none;"> PyTorch \texttt{2.x} introduces a compiler designed to accelerate deep learning programs. However, for machine learning researchers, adapting to the PyTorch compiler to full potential can be challenging. The compiler operates at the Python bytecode level, making it appear as an opaque box. To address this, we introduce \texttt{depyf}, a tool designed to demystify the inner workings of the PyTorch compiler. \texttt{depyf} decompiles bytecode generated by PyTorch back into equivalent source code, and establishes connections between in-memory code objects and their on-disk source code counterparts. This feature enables users to step through the source code line by line using debuggers, thus enhancing their understanding of the underlying processes. Notably, \texttt{depyf} is non-intrusive and user-friendly, primarily relying on two convenient context managers for its core functionality. The project is \href{https://github.com/thuml/depyf}{ openly available} and is recognized as a \href{https://pytorch.org/ecosystem/}{PyTorch ecosystem project}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13839v1-abstract-full').style.display = 'none'; document.getElementById('2403.13839v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10131">arXiv:2403.10131</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.10131">pdf</a>, <a href="https://arxiv.org/format/2403.10131">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RAFT: Adapting Language Model to Domain Specific RAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tianjun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Patil%2C+S+G">Shishir G. Patil</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+N">Naman Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Sheng Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zaharia%2C+M">Matei Zaharia</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. 
Gonzalez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10131v2-abstract-short" style="display: inline;"> Pretraining Large Language Models (LLMs) on large corpora of textual data is now a standard paradigm. When using these LLMs for many downstream applications, it is common to additionally bake in new knowledge (e.g., time-critical news, or private domain knowledge) into the pretrained model either through RAG-based prompting or fine-tuning. However, the optimal methodology for the model to gain su&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10131v2-abstract-full').style.display = 'inline'; document.getElementById('2403.10131v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10131v2-abstract-full" style="display: none;"> Pretraining Large Language Models (LLMs) on large corpora of textual data is now a standard paradigm. When using these LLMs for many downstream applications, it is common to additionally bake in new knowledge (e.g., time-critical news, or private domain knowledge) into the pretrained model either through RAG-based prompting or fine-tuning. However, the optimal methodology for the model to gain such new knowledge remains an open question. In this paper, we present Retrieval Augmented FineTuning (RAFT), a training recipe that improves the model&#39;s ability to answer questions in an &#34;open-book&#34; in-domain setting. In RAFT, given a question and a set of retrieved documents, we train the model to ignore those documents that don&#39;t help in answering the question, which we call distractor documents. RAFT accomplishes this by citing verbatim the right sequence from the relevant document that would help answer the question. This, coupled with RAFT&#39;s chain-of-thought-style response, helps improve the model&#39;s ability to reason. In domain-specific RAG, RAFT consistently improves the model&#39;s performance across PubMed, HotpotQA, and Gorilla datasets, presenting a post-training recipe to adapt pre-trained LLMs to in-domain RAG. RAFT&#39;s code and demo are open-sourced at github.com/ShishirPatil/gorilla. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10131v2-abstract-full').style.display = 'none'; document.getElementById('2403.10131v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.07974">arXiv:2403.07974</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.07974">pdf</a>, <a href="https://arxiv.org/format/2403.07974">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jain%2C+N">Naman Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+K">King Han</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+A">Alex Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wen-Ding Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+F">Fanjia Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tianjun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Sida Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Solar-Lezama%2C+A">Armando Solar-Lezama</a>, <a href="/search/cs?searchtype=author&amp;query=Sen%2C+K">Koushik Sen</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.07974v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) applied to code-related applications have emerged as a prominent field, attracting significant interest from both academia and industry. However, as new and improved LLMs are developed, existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient for assessing their capabilities. In this work, we propose LiveCodeBench, a comprehensive and contaminati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.07974v2-abstract-full').style.display = 'inline'; document.getElementById('2403.07974v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.07974v2-abstract-full" style="display: none;"> Large Language Models (LLMs) applied to code-related applications have emerged as a prominent field, attracting significant interest from both academia and industry. However, as new and improved LLMs are developed, existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient for assessing their capabilities. In this work, we propose LiveCodeBench, a comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our benchmark also focuses on a broader range of code related capabilities, such as self-repair, code execution, and test output prediction, beyond just code generation. Currently, LiveCodeBench hosts four hundred high-quality coding problems that were published between May 2023 and May 2024. 
We have evaluated 18 base LLMs and 34 instruction-tuned LLMs on LiveCodeBench. We present empirical findings on contamination, holistic performance comparisons, potential overfitting in existing benchmarks, as well as individual model comparisons. We will release all prompts and model completions for further community analysis, along with a general toolkit for adding new scenarios and models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.07974v2-abstract-full').style.display = 'none'; document.getElementById('2403.07974v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website - https://livecodebench.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05821">arXiv:2403.05821</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.05821">pdf</a>, <a href="https://arxiv.org/format/2403.05821">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Optimizing LLM Queries in Relational Workloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Biswal%2C+A">Asim Biswal</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+A">Audrey Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Mo%2C+X">Xiangxi Mo</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Shiyi Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zaharia%2C+M">Matei Zaharia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05821v1-abstract-short" style="display: inline;"> Analytical database providers (e.g., Redshift, Databricks, BigQuery) have rapidly added support for invoking Large Language Models (LLMs) through native user-defined functions (UDFs) to help users perform natural language tasks, such as classification, entity extraction, and translation, inside analytical workloads. 
For instance, an analyst might want to extract customer sentiments on millions of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05821v1-abstract-full').style.display = 'inline'; document.getElementById('2403.05821v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05821v1-abstract-full" style="display: none;"> Analytical database providers (e.g., Redshift, Databricks, BigQuery) have rapidly added support for invoking Large Language Models (LLMs) through native user-defined functions (UDFs) to help users perform natural language tasks, such as classification, entity extraction, and translation, inside analytical workloads. For instance, an analyst might want to extract customer sentiments on millions of product reviews. However, LLM inference is highly expensive in both computational and economic terms: for example, an NVIDIA L4 GPU running Llama2-7B can only process 6 KB of text per second. In this paper, we explore how to optimize LLM inference for analytical workloads that invoke LLMs within relational queries. We show that relational queries present novel opportunities for accelerating LLM inference, including reordering rows to maximize key-value (KV) cache reuse within the LLM inference engine, reordering columns within a row to further increase cache reuse, and deduplicating redundant inference requests. We implement these optimizations in Apache Spark, with vLLM as the model serving backend and achieve up to 4.4x improvement in end-to-end latency on a benchmark of diverse LLM-based queries on real datasets. To the best of our knowledge, this is the first work to explicitly address the problem of optimizing LLM invocations within SQL queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05821v1-abstract-full').style.display = 'none'; document.getElementById('2403.05821v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.04132">arXiv:2403.04132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.04132">pdf</a>, <a href="https://arxiv.org/format/2403.04132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L">Lianmin Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+Y">Ying Sheng</a>, <a href="/search/cs?searchtype=author&amp;query=Angelopoulos%2C+A+N">Anastasios Nikolas Angelopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianle Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dacheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+B">Banghua Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Jordan%2C+M">Michael Jordan</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.04132v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have unlocked new capabilities and applications; however, evaluating the alignment with human preferences still poses significant challenges. To address this issue, we introduce Chatbot Arena, an open platform for evaluating LLMs based on human preferences. Our methodology employs a pairwise comparison approach and leverages input from a diverse user base through crowd&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04132v1-abstract-full').style.display = 'inline'; document.getElementById('2403.04132v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.04132v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have unlocked new capabilities and applications; however, evaluating the alignment with human preferences still poses significant challenges. To address this issue, we introduce Chatbot Arena, an open platform for evaluating LLMs based on human preferences. Our methodology employs a pairwise comparison approach and leverages input from a diverse user base through crowdsourcing. The platform has been operational for several months, amassing over 240K votes. This paper describes the platform, analyzes the data we have collected so far, and explains the tried-and-true statistical methods we are using for efficient and accurate evaluation and ranking of models. We confirm that the crowdsourced questions are sufficiently diverse and discriminating and that the crowdsourced human votes are in good agreement with those of expert raters. 
These analyses collectively establish a robust foundation for the credibility of Chatbot Arena. Because of its unique value and openness, Chatbot Arena has emerged as one of the most referenced LLM leaderboards, widely cited by leading LLM developers and companies. Our demo is publicly available at \url{https://chat.lmsys.org}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04132v1-abstract-full').style.display = 'none'; document.getElementById('2403.04132v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.02419">arXiv:2403.02419</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.02419">pdf</a>, <a href="https://arxiv.org/format/2403.02419">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Are More LLM Calls All You Need? Towards Scaling Laws of Compound Inference Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lingjiao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Davis%2C+J+Q">Jared Quincy Davis</a>, <a href="/search/cs?searchtype=author&amp;query=Hanin%2C+B">Boris Hanin</a>, <a href="/search/cs?searchtype=author&amp;query=Bailis%2C+P">Peter Bailis</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&amp;query=Zaharia%2C+M">Matei Zaharia</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+J">James Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.02419v2-abstract-short" style="display: inline;"> Many recent state-of-the-art results in language tasks were achieved using compound systems that perform multiple Language Model (LM) calls and aggregate their responses. However, there is little understanding of how the number of LM calls - e.g., when asking the LM to answer each question multiple times and taking a majority vote - affects such a compound system&#39;s performance. In this paper, we i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02419v2-abstract-full').style.display = 'inline'; document.getElementById('2403.02419v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.02419v2-abstract-full" style="display: none;"> Many recent state-of-the-art results in language tasks were achieved using compound systems that perform multiple Language Model (LM) calls and aggregate their responses. 
However, there is little understanding of how the number of LM calls - e.g., when asking the LM to answer each question multiple times and taking a majority vote - affects such a compound system&#39;s performance. In this paper, we initiate the study of scaling properties of compound inference systems. We analyze, theoretically and empirically, how the number of LM calls affects the performance of Vote and Filter-Vote, two of the simplest compound system designs, which aggregate LM responses via majority voting, optionally applying LM filters. We find, surprisingly, that across multiple language tasks, the performance of both Vote and Filter-Vote can first increase but then decrease as a function of the number of LM calls. Our theoretical results suggest that this non-monotonicity is due to the diversity of query difficulties within a task: more LM calls lead to higher performance on &#34;easy&#34; queries, but lower performance on &#34;hard&#34; queries, and non-monotone behavior can emerge when a task contains both types of queries. This insight then allows us to compute, from a small number of samples, the number of LM calls that maximizes system performance, and define an analytical scaling model for both systems. Experiments show that our scaling model can accurately predict the performance of Vote and Filter-Vote systems and thus find the optimal number of LM calls to make. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02419v2-abstract-full').style.display = 'none'; document.getElementById('2403.02419v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Stoica%2C+I&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Stoica%2C+I&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Stoica%2C+I&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Stoica%2C+I&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Stoica%2C+I&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path 
d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
