CINXE.COM
Computation and Language
<!DOCTYPE html> <html lang="en"> <head> <title>Computation and Language </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js" /></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/cs.CL/recent">cs.CL</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img 
src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Computation and Language</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item210">Cross-lists</a></li> <li><a href="#item276">Replacements</a></li> </ul> <p>See <a id="recent-cs.CL" aria-labelledby="recent-cs.CL" href="/list/cs.CL/recent">recent</a> articles</p> <h3>Showing new listings for Tuesday, 18 February 2025</h3> <div class='paging'>Total of 493 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CL/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 209 of 209 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2502.10497" title="Abstract" id="2502.10497"> 
arXiv:2502.10497 </a> [<a href="/pdf/2502.10497" title="Download PDF" id="pdf-2502.10497" aria-labelledby="pdf-2502.10497">pdf</a>, <a href="/format/2502.10497" title="Other formats" id="oth-2502.10497" aria-labelledby="oth-2502.10497">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hallucinations and Truth: A Comprehensive Accuracy Evaluation of RAG, LoRA and DoRA </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Baqar,+M">Mohammad Baqar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khanda,+R">Rajat Khanda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 Pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent advancements in Generative AI have significantly improved the efficiency and adaptability of natural language processing (NLP) systems, particularly through Retrieval-Augmented Generation (RAG), Low-Rank Adaptation (LoRA), and Weight-Decomposed Low-Rank Adaptation (DoRA). RAG integrates external knowledge to enhance factual consistency in generative outputs, while LoRA enables parameter-efficient fine-tuning of large language models (LLMs). DoRA further refines this process by optimizing fine-tuning through adaptive parameter ranking and domain-aware weight adjustments, improving learning efficiency while maintaining inference performance. <br>This paper presents a large-scale empirical evaluation of RAG, LoRA, and DoRA, with model fine-tuning and generation performance assessed on 20,000 FAQ-based queries, while the knowledge base spans 400,000 entries. The study analyzes key performance metrics such as accuracy, relevance, and inference latency. 
Experimental results demonstrate that DoRA achieves the highest accuracy (90.1%), relevance score (0.88), and lowest latency (110 ms per query), outperforming both LoRA and RAG in real-world, domain-specific generative AI applications. <br>Furthermore, this study examines the trade-offs between fine-tuning efficiency, computational cost, and real-time adaptability across different models. Findings highlight RAG's effectiveness in knowledge grounding, LoRA's cost-efficient domain adaptation, and DoRA's ability to balance fine-tuning efficiency with model precision. These insights provide practical guidance for deploying AI-driven generative systems in accuracy-critical domains such as healthcare, finance, and legal services, ensuring scalability, reliability, and optimal performance in dynamic environments. </p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2502.10577" title="Abstract" id="2502.10577"> arXiv:2502.10577 </a> [<a href="/pdf/2502.10577" title="Download PDF" id="pdf-2502.10577" aria-labelledby="pdf-2502.10577">pdf</a>, <a href="https://arxiv.org/html/2502.10577v1" title="View HTML" id="html-2502.10577" aria-labelledby="html-2502.10577" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10577" title="Other formats" id="oth-2502.10577" aria-labelledby="oth-2502.10577">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Man Made Language Models? 
Evaluating LLMs' Perpetuation of Masculine Generics Bias </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Doyen,+E">Enzo Doyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Todirascu,+A">Amalia Todirascu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages, 5 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have been shown to propagate and even amplify gender bias, in English and other languages, in specific or constrained contexts. However, no studies so far have focused on gender biases conveyed by LLMs' responses to generic instructions, especially with regard to masculine generics (MG). MG are a linguistic feature found in many gender-marked languages, denoting the use of the masculine gender as a "default" or supposedly neutral gender to refer to a mixed group of men and women, or of a person whose gender is irrelevant or unknown. Numerous psycholinguistic studies have shown that MG are not neutral and induce gender bias. This work aims to analyze the use of MG by both proprietary and local LLMs in responses to generic instructions and evaluate their MG bias rate. We focus on French and create a human noun database from existing lexical resources. We filter existing French instruction datasets to retrieve generic instructions and analyze the responses of 6 different LLMs. Overall, we find that $\approx$39.5\% of LLMs' responses to generic instructions are MG-biased ($\approx$73.1\% across responses with human nouns). Our findings also reveal that LLMs are reluctant to use gender-fair language spontaneously. 
</p> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2502.10582" title="Abstract" id="2502.10582"> arXiv:2502.10582 </a> [<a href="/pdf/2502.10582" title="Download PDF" id="pdf-2502.10582" aria-labelledby="pdf-2502.10582">pdf</a>, <a href="https://arxiv.org/html/2502.10582v1" title="View HTML" id="html-2502.10582" aria-labelledby="html-2502.10582" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10582" title="Other formats" id="oth-2502.10582" aria-labelledby="oth-2502.10582">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Named entity recognition for Serbian legal documents: Design, methodology and dataset development </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kalu%C5%A1ev,+V">Vladimir Kalušev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brklja%C4%8D,+B">Branko Brkljač</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 6 figures, 1 table, associated NER4Legal_SRB model and dataset are available at <a href="https://huggingface.co/kalusev/NER4Legal_SRB" rel="external noopener nofollow" class="link-external link-https">this https URL</a> , paper submitted to 15th International Conference on Information Society and Technology (ICIST), Kopaonik, Serbia, 9-12 March 2025, conference track: Generative AI and Large Language Models </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in the field of natural language processing (NLP) and especially large language models (LLMs) and their numerous applications have brought research attention to design of different document processing tools and enhancements in the process of document archiving, search and retrieval. 
Domain of official, legal documents is especially interesting due to vast amount of data generated on the daily basis, as well as the significant community of interested practitioners (lawyers, law offices, administrative workers, state institutions and citizens). Providing efficient ways for automation of everyday work involving legal documents is therefore expected to have significant impact in different fields. In this work we present one LLM based solution for Named Entity Recognition (NER) in the case of legal documents written in Serbian language. It leverages on the pre-trained bidirectional encoder representations from transformers (BERT), which had been carefully adapted to the specific task of identifying and classifying specific data points from textual content. Besides novel dataset development for Serbian language (involving public court rulings), presented system design and applied methodology, the paper also discusses achieved performance metrics and their implications for objective assessment of the proposed solution. Performed cross-validation tests on the created manually labeled dataset with mean $F_1$ score of 0.96 and additional results on the examples of intentionally modified text inputs confirm applicability of the proposed system design and robustness of the developed NER solution. </p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2502.10596" title="Abstract" id="2502.10596"> arXiv:2502.10596 </a> [<a href="/pdf/2502.10596" title="Download PDF" id="pdf-2502.10596" aria-labelledby="pdf-2502.10596">pdf</a>, <a href="https://arxiv.org/html/2502.10596v1" title="View HTML" id="html-2502.10596" aria-labelledby="html-2502.10596" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10596" title="Other formats" id="oth-2502.10596" aria-labelledby="oth-2502.10596">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Post-training an LLM for RAG? 
Train on Self-Generated Demonstrations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Finlayson,+M">Matthew Finlayson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kulikov,+I">Ilia Kulikov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bikel,+D+M">Daniel M. Bikel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oguz,+B">Barlas Oguz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xilun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pappu,+A">Aasish Pappu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) often struggle with knowledge intensive NLP tasks, such as answering "Who won the latest World Cup?" because the knowledge they learn during training may be insufficient or outdated. Conditioning generation on retrieved documents -- a technique known as retrieval augmented generation (RAG) -- mitigates these shortcomings by allowing the model to leverage in-context information. Practitioners can improve LLM RAG performance by fine-tuning on retrieval-augmented instructions, but must beware that this can cause undesirable model behaviors like hallucinations. We attribute this degradation to the fact that the training data is likely to be out-of-distribution for the model and may suffer from quality issues, such as misalignment between retrievals and target responses (since retrievals are frequently added post-hoc). We propose a recipe for training RAG-enabled LLMs using self-generated demonstrations, thereby avoiding training on out-of-distribution text and integrating retrievals into the LLM responses. 
We evaluate our method on knowledge intensive question answering (QA) tasks and show that our method teaches LLMs to properly handle in-context retrievals and abstain from questions it will likely get wrong. Compared to conventional RA-IT methods, our method prevents model degradation in non-RAG settings while exhibiting superior QA performance. </p> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2502.10615" title="Abstract" id="2502.10615"> arXiv:2502.10615 </a> [<a href="/pdf/2502.10615" title="Download PDF" id="pdf-2502.10615" aria-labelledby="pdf-2502.10615">pdf</a>, <a href="https://arxiv.org/html/2502.10615v1" title="View HTML" id="html-2502.10615" aria-labelledby="html-2502.10615" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10615" title="Other formats" id="oth-2502.10615" aria-labelledby="oth-2502.10615">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Retrieval-augmented Encoders for Extreme Multi-label Text Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yau-Shian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+W">Wei-Cheng Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jyun-Yu Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+H">Hsiang-Fu Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vishwanathan,+S+V+N">S. V. N. Vishwanathan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Extreme multi-label classification (XMC) seeks to find relevant labels from an extremely large label collection for a given text input. 
To tackle such a vast label space, current state-of-the-art methods fall into two categories. The one-versus-all (OVA) method uses learnable label embeddings for each label, excelling at memorization (i.e., capturing detailed training signals for accurate head label prediction). In contrast, the dual-encoder (DE) model maps input and label text into a shared embedding space for better generalization (i.e., the capability of predicting tail labels with limited training data), but may fall short at memorization. To achieve generalization and memorization, existing XMC methods often combine DE and OVA models, which involves complex training pipelines. Inspired by the success of retrieval-augmented language models, we propose the Retrieval-augmented Encoders for XMC (RAEXMC), a novel framework that equips a DE model with retrieval-augmented capability for efficient memorization without additional trainable parameter. During training, RAEXMC is optimized by the contrastive loss over a knowledge memory that consists of both input instances and labels. During inference, given a test input, RAEXMC retrieves the top-$K$ keys from the knowledge memory, and aggregates the corresponding values as the prediction scores. We showcase the effectiveness and efficiency of RAEXMC on four public LF-XMC benchmarks. RAEXMC not only advances the state-of-the-art (SOTA) DE method DEXML, but also achieves more than 10x speedup on the largest LF-AmazonTitles-1.3M dataset under the same 8 A100 GPUs training environments. 
</p> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2502.10632" title="Abstract" id="2502.10632"> arXiv:2502.10632 </a> [<a href="/pdf/2502.10632" title="Download PDF" id="pdf-2502.10632" aria-labelledby="pdf-2502.10632">pdf</a>, <a href="https://arxiv.org/html/2502.10632v1" title="View HTML" id="html-2502.10632" aria-labelledby="html-2502.10632" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10632" title="Other formats" id="oth-2502.10632" aria-labelledby="oth-2502.10632">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Code-Mixed Telugu-English Hate Speech Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kakarla,+S">Santhosh Kakarla</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Venkata,+G+S+B">Gautama Shastry Bulusu Venkata</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 1 figure, 2 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Hate speech detection in low-resource languages like Telugu is a growing challenge in NLP. This study investigates transformer-based models, including TeluguHateBERT, HateBERT, DeBERTa, Muril, IndicBERT, Roberta, and Hindi-Abusive-MuRIL, for classifying hate speech in Telugu. We fine-tune these models using Low-Rank Adaptation (LoRA) to optimize efficiency and performance. Additionally, we explore a multilingual approach by translating Telugu text into English using Google Translate to assess its impact on classification accuracy. <br>Our experiments reveal that most models show improved performance after translation, with DeBERTa and Hindi-Abusive-MuRIL achieving higher accuracy and F1 scores compared to training directly on Telugu text. 
Notably, Hindi-Abusive-MuRIL outperforms all other models in both the original Telugu dataset and the translated dataset, demonstrating its robustness across different linguistic settings. This suggests that translation enables models to leverage richer linguistic features available in English, leading to improved classification performance. The results indicate that multilingual processing can be an effective approach for hate speech detection in low-resource languages. These findings demonstrate that transformer models, when fine-tuned appropriately, can significantly improve hate speech detection in Telugu, paving the way for more robust multilingual NLP applications. </p> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2502.10634" title="Abstract" id="2502.10634"> arXiv:2502.10634 </a> [<a href="/pdf/2502.10634" title="Download PDF" id="pdf-2502.10634" aria-labelledby="pdf-2502.10634">pdf</a>, <a href="https://arxiv.org/html/2502.10634v1" title="View HTML" id="html-2502.10634" aria-labelledby="html-2502.10634" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10634" title="Other formats" id="oth-2502.10634" aria-labelledby="oth-2502.10634">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Lost in the Passage: Passage-level In-context Learning Does Not Necessarily Need a "Passage" </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Hao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+C">Chenming Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Gengyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yunfang Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> By simply incorporating demonstrations into the 
context, in-context learning (ICL) enables large language models (LLMs) to yield awesome performance on many tasks. In this paper, we focus on passage-level long-context ICL for generation tasks and find that LLMs cannot learn the intrinsic relationships between the demonstration passage and the generation output. We conduct experiments with different LLMs on two typical generation tasks including single-document QA and distractor generation, demonstrating that even a completely meaningless demonstration passage with 1/4 length achieves much better performance than the original full passage. Analysis via attention score reveals that LLMs pay little attention to passages compared to other components in prompt and little attention flows from the passage to other parts of the demonstration, which further confirms our finding. Additionally, experiments on context compression indicate that compression approaches proven effective on other long-context tasks are not suitable for passage-level ICL, since simply using shorter meaningless demonstration passages has achieved competitive performance. 
</p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2502.10641" title="Abstract" id="2502.10641"> arXiv:2502.10641 </a> [<a href="/pdf/2502.10641" title="Download PDF" id="pdf-2502.10641" aria-labelledby="pdf-2502.10641">pdf</a>, <a href="https://arxiv.org/html/2502.10641v1" title="View HTML" id="html-2502.10641" aria-labelledby="html-2502.10641" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10641" title="Other formats" id="oth-2502.10641" aria-labelledby="oth-2502.10641">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Toward Equitable Access: Leveraging Crowdsourced Reviews to Investigate Public Perceptions of Health Resource Accessibility </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+Z">Zhaoqian Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+G">Guanhong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+K">Kai Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Q">Qingcheng Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Songhua Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+W">Wenyue Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+L">Lizhou Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yongfeng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lingyao Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Access to health resources is a critical determinant of public well-being and societal resilience, particularly during public health crises when demand for medical services and 
preventive care surges. However, disparities in accessibility persist across demographic and geographic groups, raising concerns about equity. Traditional survey methods often fall short due to limitations in coverage, cost, and timeliness. This study leverages crowdsourced data from Google Maps reviews, applying advanced natural language processing techniques, specifically ModernBERT, to extract insights on public perceptions of health resource accessibility in the United States during the COVID-19 pandemic. Additionally, we employ Partial Least Squares regression to examine the relationship between accessibility perceptions and key socioeconomic and demographic factors including political affiliation, racial composition, and educational attainment. Our findings reveal that public perceptions of health resource accessibility varied significantly across the U.S., with disparities peaking during the pandemic and slightly easing post-crisis. Political affiliation, racial demographics, and education levels emerged as key factors shaping these perceptions. These findings underscore the need for targeted interventions and policy measures to address inequities, fostering a more inclusive healthcare infrastructure that can better withstand future public health challenges. 
</p> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2502.10645" title="Abstract" id="2502.10645"> arXiv:2502.10645 </a> [<a href="/pdf/2502.10645" title="Download PDF" id="pdf-2502.10645" aria-labelledby="pdf-2502.10645">pdf</a>, <a href="https://arxiv.org/html/2502.10645v1" title="View HTML" id="html-2502.10645" aria-labelledby="html-2502.10645" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10645" title="Other formats" id="oth-2502.10645" aria-labelledby="oth-2502.10645">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BabyLM Turns 3: Call for papers for the 2025 BabyLM workshop </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Charpentier,+L">Lucas Charpentier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choshen,+L">Leshem Choshen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cotterell,+R">Ryan Cotterell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gul,+M+O">Mustafa Omer Gul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Michael Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jumelet,+J">Jaap Jumelet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Linzen,+T">Tal Linzen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mueller,+A">Aaron Mueller</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ross,+C">Candace Ross</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shah,+R+S">Raj Sanjay Shah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Warstadt,+A">Alex Warstadt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wilcox,+E">Ethan Wilcox</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Williams,+A">Adina 
Williams</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> EMNLP 2025 BabyLM Workshop. arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2404.06214" data-arxiv-id="2404.06214" class="link-https">arXiv:2404.06214</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> BabyLM aims to dissolve the boundaries between cognitive modeling and language modeling. We call for both workshop papers and for researchers to join the 3rd BabyLM competition. As in previous years, we call for participants in the data-efficient pretraining challenge in the general track. This year, we also offer a new track: INTERACTION. This new track encourages interactive behavior, learning from a teacher, and adapting the teaching material to the student. We also call for papers outside the competition in any relevant areas. These include training efficiency, cognitively plausible research, weak model evaluation, and more. 
</p> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2502.10660" title="Abstract" id="2502.10660"> arXiv:2502.10660 </a> [<a href="/pdf/2502.10660" title="Download PDF" id="pdf-2502.10660" aria-labelledby="pdf-2502.10660">pdf</a>, <a href="https://arxiv.org/html/2502.10660v1" title="View HTML" id="html-2502.10660" aria-labelledby="html-2502.10660" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10660" title="Other formats" id="oth-2502.10660" aria-labelledby="oth-2502.10660">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> User Profile with Large Language Models: Construction, Updating, and Benchmarking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Prottasha,+N+J">Nusrat Jahan Prottasha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kowsher,+M">Md Kowsher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raman,+H">Hafijur Raman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anny,+I+J">Israt Jahan Anny</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhat,+P">Prakash Bhat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garibay,+I">Ivan Garibay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garibay,+O">Ozlem Garibay</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> User profile modeling plays a key role in personalized systems, as it requires building accurate profiles and updating them with new information. In this paper, we present two high-quality open-source user profile datasets: one for profile construction and another for profile updating. These datasets offer a strong basis for evaluating user profile modeling techniques in dynamic settings. 
We also show a methodology that uses large language models (LLMs) to tackle both profile construction and updating. Our method uses a probabilistic framework to predict user profiles from input text, allowing for precise and context-aware profile generation. Our experiments demonstrate that models like Mistral-7b and Llama2-7b perform strongly in both tasks. LLMs improve the precision and recall of the generated profiles, and high evaluation scores confirm the effectiveness of our approach. </p> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2502.10699" title="Abstract" id="2502.10699"> arXiv:2502.10699 </a> [<a href="/pdf/2502.10699" title="Download PDF" id="pdf-2502.10699" aria-labelledby="pdf-2502.10699">pdf</a>, <a href="https://arxiv.org/html/2502.10699v1" title="View HTML" id="html-2502.10699" aria-labelledby="html-2502.10699" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10699" title="Other formats" id="oth-2502.10699" aria-labelledby="oth-2502.10699">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Synaptic Resonance in Large Language Models: A Novel Approach to Contextual Memory Integration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Applegarth,+G">George Applegarth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weatherstone,+C">Christian Weatherstone</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hollingsworth,+M">Maximilian Hollingsworth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Middlebrook,+H">Henry Middlebrook</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Irvin,+M">Marcus Irvin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Neural and Evolutionary Computing (cs.NE) </div> <p 
class='mathjax'> Contextual memory integration remains a significant challenge in the development of language models, particularly in tasks that require maintaining coherence over extended sequences. Traditional approaches, such as self-attention mechanisms and memory-augmented architectures, often prioritize short-term dependencies, leading to fragmentation and inconsistency in long-range contextual understanding. Inspired by principles of synaptic plasticity observed in biological neural systems, a novel mechanism, Synaptic Resonance, is introduced to dynamically reinforce relevant memory pathways during training and inference. Unlike static memory representations, this mechanism continuously adjusts synaptic weight matrices based on contextual relevance, allowing for improved information retention without excessive computational overhead. Evaluations conducted on an open-source language model demonstrate reductions in perplexity, enhancements in contextual coherence, and increased robustness against input noise, highlighting the effectiveness of reinforcement-driven memory modulation. Comparative analysis against baseline models further reveals that the proposed approach achieves higher memory retention efficiency while maintaining computational feasibility. The architectural modifications integrate seamlessly into existing transformer-based frameworks, ensuring stable convergence and efficient inference without sacrificing scalability. Applications benefiting from improved long-term contextual consistency, such as dialogue systems and document summarization, stand to gain from this approach. Empirical findings suggest that dynamically reinforced memory pathways offer a promising alternative to conventional memory mechanisms, addressing longstanding limitations in extended sequence modeling. 
</p> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2502.10708" title="Abstract" id="2502.10708"> arXiv:2502.10708 </a> [<a href="/pdf/2502.10708" title="Download PDF" id="pdf-2502.10708" aria-labelledby="pdf-2502.10708">pdf</a>, <a href="https://arxiv.org/html/2502.10708v1" title="View HTML" id="html-2502.10708" aria-labelledby="html-2502.10708" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10708" title="Other formats" id="oth-2502.10708" aria-labelledby="oth-2502.10708">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Injecting Domain-Specific Knowledge into Large Language Models: A Comprehensive Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Z">Zirui Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+B">Bin Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuhan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+M">Miao Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingzhe Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+R">Rui Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiuying Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In processing </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated remarkable success in various tasks such as natural language understanding, text summarization, and machine translation. However, their general-purpose nature often limits their effectiveness in domain-specific applications that require specialized knowledge, such as healthcare, chemistry, or legal analysis. 
To address this, researchers have explored diverse methods to enhance LLMs by integrating domain-specific knowledge. In this survey, we provide a comprehensive overview of these methods, which we categorize into four key approaches: dynamic knowledge injection, static knowledge embedding, modular adapters, and prompt optimization. Each approach offers unique mechanisms to equip LLMs with domain expertise, balancing trade-offs between flexibility, scalability, and efficiency. We discuss how these methods enable LLMs to tackle specialized tasks, compare their advantages and disadvantages, evaluate domain-specific LLMs against general LLMs, and highlight the challenges and opportunities in this emerging field. For those interested in delving deeper into this area, we also summarize the commonly used datasets and benchmarks. To keep researchers updated on the latest studies, we maintain an open-source repository at: <a href="https://github.com/abilliyb/Knowledge_Injection_Survey_Papers" rel="external noopener nofollow" class="link-external link-https">this https URL</a>, dedicated to documenting research in the field of specialized LLMs. 
</p> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2502.10709" title="Abstract" id="2502.10709"> arXiv:2502.10709 </a> [<a href="/pdf/2502.10709" title="Download PDF" id="pdf-2502.10709" aria-labelledby="pdf-2502.10709">pdf</a>, <a href="https://arxiv.org/html/2502.10709v1" title="View HTML" id="html-2502.10709" aria-labelledby="html-2502.10709" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10709" title="Other formats" id="oth-2502.10709" aria-labelledby="oth-2502.10709">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An Empirical Analysis of Uncertainty in Large Language Model Evaluations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Q">Qiujie Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qingqiu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhuohao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuejie Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yue Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Linyi Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> As LLM-as-a-Judge emerges as a new paradigm for assessing large language models (LLMs), concerns have been raised regarding the alignment, bias, and stability of LLM evaluators. While substantial work has focused on alignment and bias, little research has concentrated on the stability of LLM evaluators. 
In this paper, we conduct extensive experiments involving 9 widely used LLM evaluators across 2 different evaluation settings to investigate the uncertainty in model-based LLM evaluations. We pinpoint that LLM evaluators exhibit varying uncertainty based on model families and sizes. With careful comparative analyses, we find that employing special prompting strategies, whether during inference or post-training, can alleviate evaluation uncertainty to some extent. By utilizing uncertainty to enhance LLM's reliability and detection capability in Out-Of-Distribution (OOD) data, we further fine-tune an uncertainty-aware LLM evaluator named ConfiLM using a human-annotated fine-tuning set and assess ConfiLM's OOD evaluation ability on a manually designed test set sourced from the 2024 Olympics. Experimental results demonstrate that incorporating uncertainty as additional information during the fine-tuning phase can largely improve the model's evaluation performance in OOD scenarios. The code and data are released at: <a href="https://github.com/hasakiXie123/LLM-Evaluator-Uncertainty" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2502.10725" title="Abstract" id="2502.10725"> arXiv:2502.10725 </a> [<a href="/pdf/2502.10725" title="Download PDF" id="pdf-2502.10725" aria-labelledby="pdf-2502.10725">pdf</a>, <a href="https://arxiv.org/html/2502.10725v1" title="View HTML" id="html-2502.10725" aria-labelledby="html-2502.10725" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10725" title="Other formats" id="oth-2502.10725" aria-labelledby="oth-2502.10725">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PropNet: a White-Box and Human-Like Network for Sentence Representation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fei Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Transformer-based embedding methods have dominated the field of sentence representation in recent years. Although they have achieved remarkable performance on NLP missions, such as semantic textual similarity (STS) tasks, their black-box nature and large-data-driven training style have raised concerns, including issues related to bias, trust, and safety. Many efforts have been made to improve the interpretability of embedding models, but these problems have not been fundamentally resolved. To achieve inherent interpretability, we propose a purely white-box and human-like sentence representation network, PropNet. Inspired by findings from cognitive science, PropNet constructs a hierarchical network based on the propositions contained in a sentence. While experiments indicate that PropNet has a significant gap compared to state-of-the-art (SOTA) embedding models in STS tasks, case studies reveal substantial room for improvement. 
Additionally, PropNet enables us to analyze and understand the human cognitive processes underlying STS benchmarks. </p> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2502.10735" title="Abstract" id="2502.10735"> arXiv:2502.10735 </a> [<a href="/pdf/2502.10735" title="Download PDF" id="pdf-2502.10735" aria-labelledby="pdf-2502.10735">pdf</a>, <a href="https://arxiv.org/html/2502.10735v1" title="View HTML" id="html-2502.10735" aria-labelledby="html-2502.10735" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10735" title="Other formats" id="oth-2502.10735" aria-labelledby="oth-2502.10735">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OPTISHEAR: Towards Efficient and Adaptive Pruning of Large Language Models via Evolutionary Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Bowei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linqi Song</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Post-training pruning has emerged as a crucial optimization technique as large language models (LLMs) continue to grow rapidly. However, the significant variations in weight distributions across different LLMs make fixed pruning strategies inadequate for multiple models. In this paper, we introduce \textbf{\textsc{OptiShear}}, an efficient evolutionary optimization framework for adaptive LLM pruning. 
Our framework features two key innovations: an effective search space built on our Meta pruning metric to handle diverse weight distributions, and a model-wise reconstruction error for rapid evaluation during search trials. We employ Non-dominated Sorting Genetic Algorithm III (NSGA-III) to optimize both pruning metrics and layerwise sparsity ratios. Through extensive evaluation on LLaMA-1/2/3 and Mistral models (7B-70B) across multiple benchmarks, we demonstrate that our adaptive pruning metrics consistently outperform existing methods. Additionally, our discovered layerwise sparsity ratios enhance the effectiveness of other pruning metrics. The framework exhibits strong cross-task and cross-model generalizability, providing a cost-effective solution for model compression. </p> </div> </dd> <dt> <a name='item16'>[16]</a> <a href ="/abs/2502.10739" title="Abstract" id="2502.10739"> arXiv:2502.10739 </a> [<a href="/pdf/2502.10739" title="Download PDF" id="pdf-2502.10739" aria-labelledby="pdf-2502.10739">pdf</a>, <a href="https://arxiv.org/html/2502.10739v1" title="View HTML" id="html-2502.10739" aria-labelledby="html-2502.10739" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10739" title="Other formats" id="oth-2502.10739" aria-labelledby="oth-2502.10739">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BASE-SQL: A powerful open source Text-To-SQL baseline approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sheng,+L">Lei Sheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+S">Shuai-Shuai Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+W">Wei Xie</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress. 
16 pages, 3 figures, 8 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The conversion of natural language into SQL language for querying databases (Text-to-SQL) has broad application prospects and has attracted widespread attention. At present, the mainstream Text-to-SQL methods are mainly divided into in-context learning (ICL) based methods and supervised fine-tuning (SFT) based methods. ICL-based methods can achieve relatively good results thanks to the use of the most advanced closed-source models. However, in real-world application scenarios, factors such as data privacy, SQL generation efficiency and cost need to be considered. SFT-based methods have certain advantages. At present, methods based on fine-tuning of open source models lack easy-to-implement and effective (cost-effective) baseline methods. We propose a pipeline-based method using open source model fine-tuning, referred to as BASE-SQL, which includes four components: Schema Linking, Candidate SQL Generate, SQL Revision and SQL Merge Revision. Experimental results show that BASE-SQL uses the open source model Qwen2.5-Coder-32B-Instruct, and achieves an accuracy of 67.47% on the BIRD development set and 88.9% on the Spider test set, which is significantly better than other methods using open source models, and even exceeds several methods using the GPT-4o closed-source model. At the same time, BASE-SQL is easy to implement and highly efficient (on average, only five calls to the large language model are required to generate SQL once). The code will be open sourced at <a href="https://github.com/CycloneBoy/base_sql" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2502.10743" title="Abstract" id="2502.10743"> arXiv:2502.10743 </a> [<a href="/pdf/2502.10743" title="Download PDF" id="pdf-2502.10743" aria-labelledby="pdf-2502.10743">pdf</a>, <a href="https://arxiv.org/html/2502.10743v1" title="View HTML" id="html-2502.10743" aria-labelledby="html-2502.10743" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10743" title="Other formats" id="oth-2502.10743" aria-labelledby="oth-2502.10743">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> 1bit-Merging: Dynamic Quantized Merging for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Bowei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zehua Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xiongwei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Mingxuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linqi Song</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advances in large language models have led to specialized models excelling in specific domains, creating a need for efficient model merging techniques. While traditional merging approaches combine parameters into a single static model, they often compromise task-specific performance. However, task-specific routing methods maintain accuracy but introduce substantial storage overhead. 
We present \texttt{1bit}-Merging, a novel framework that integrates task-specific routing with 1-bit quantized task vectors to balance performance and storage efficiency. Our approach leverages the observation that different task-specific models store knowledge in distinct layers-chat models primarily in attention layers and math/code models in MLP layers-enabling targeted compression strategies. Through extensive experiments with LLaMA2 and Mistral model families across chat, mathematical reasoning, and code generation tasks, we demonstrate that \texttt{1bit}-Merging achieves comparable or superior performance to existing methods while significantly reducing storage requirements. Our framework offers a practical solution for combining specialized models while maintaining their individual strengths and addressing the storage challenges of current approaches. </p> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2502.10749" title="Abstract" id="2502.10749"> arXiv:2502.10749 </a> [<a href="/pdf/2502.10749" title="Download PDF" id="pdf-2502.10749" aria-labelledby="pdf-2502.10749">pdf</a>, <a href="https://arxiv.org/html/2502.10749v1" title="View HTML" id="html-2502.10749" aria-labelledby="html-2502.10749" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10749" title="Other formats" id="oth-2502.10749" aria-labelledby="oth-2502.10749">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LoRE-Merging: Exploring Low-Rank Estimation For Large Language Model Merging </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zehua Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Y">Yuxuan Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=She,+R">Ruifeng She</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xiongwei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+T">Tao Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Mingxuan Yuan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> While most current approaches rely on further training techniques, such as fine-tuning or reinforcement learning, to enhance model capacities, model merging stands out for its ability of improving models without requiring any additional training. In this paper, we propose a unified framework for model merging based on low-rank estimation of task vectors without the need for access to the base model, named \textsc{LoRE-Merging}. Our approach is motivated by the observation that task vectors from fine-tuned models frequently exhibit a limited number of dominant singular values, making low-rank estimations less prone to interference. We implement the method by formulating the merging problem as an optimization problem. Extensive empirical experiments demonstrate the effectiveness of our framework in mitigating interference and preserving task-specific information, thereby advancing the state-of-the-art performance in model merging techniques. 
</p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2502.10760" title="Abstract" id="2502.10760"> arXiv:2502.10760 </a> [<a href="/pdf/2502.10760" title="Download PDF" id="pdf-2502.10760" aria-labelledby="pdf-2502.10760">pdf</a>, <a href="https://arxiv.org/html/2502.10760v1" title="View HTML" id="html-2502.10760" aria-labelledby="html-2502.10760" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10760" title="Other formats" id="oth-2502.10760" aria-labelledby="oth-2502.10760">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Why is prompting hard? Understanding prompts on binary sequence predictors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wenliang,+L+K">Li Kevin Wenliang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruoss,+A">Anian Ruoss</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Grau-Moya,+J">Jordi Grau-Moya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hutter,+M">Marcus Hutter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Genewein,+T">Tim Genewein</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG); Machine Learning (stat.ML) </div> <p class='mathjax'> Large language models (LLMs) can be prompted to do many tasks, but finding good prompts is not always easy, nor is understanding some performant prompts. We explore these issues by viewing prompting as conditioning a near-optimal sequence predictor (LLM) pretrained on diverse data sources. Through numerous prompt search experiments, we show that the unintuitive patterns in optimal prompts can be better understood given the pretraining distribution, which is often unavailable in practice. 
Moreover, even using exhaustive search, reliably identifying optimal prompts from practical neural predictors can be difficult. Further, we demonstrate that common prompting methods, such as using intuitive prompts or samples from the targeted task, are in fact suboptimal. Thus, this work takes an initial step towards understanding the difficulties in finding and understanding optimal prompts from a statistical and empirical perspective. </p> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2502.10835" title="Abstract" id="2502.10835"> arXiv:2502.10835 </a> [<a href="/pdf/2502.10835" title="Download PDF" id="pdf-2502.10835" aria-labelledby="pdf-2502.10835">pdf</a>, <a href="https://arxiv.org/html/2502.10835v1" title="View HTML" id="html-2502.10835" aria-labelledby="html-2502.10835" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10835" title="Other formats" id="oth-2502.10835" aria-labelledby="oth-2502.10835">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Back Attention: Understanding and Enhancing Multi-Hop Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zeping Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belinkov,+Y">Yonatan Belinkov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ananiadou,+S">Sophia Ananiadou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We investigate how large language models perform latent multi-hop reasoning in prompts like "Wolfgang Amadeus Mozart's mother's spouse is". 
To analyze this process, we introduce logit flow, an interpretability method that traces how logits propagate across layers and positions toward the final prediction. Using logit flow, we identify four distinct stages in single-hop knowledge prediction: (A) entity subject enrichment, (B) entity attribute extraction, (C) relation subject enrichment, and (D) relation attribute extraction. Extending this analysis to multi-hop reasoning, we find that failures often stem from the relation attribute extraction stage, where conflicting logits reduce prediction accuracy. To address this, we propose back attention, a novel mechanism that enables lower layers to leverage higher-layer hidden states from different positions during attention computation. With back attention, a 1-layer transformer achieves the performance of a 2-layer transformer. Applied to four LLMs, back attention improves accuracy on five reasoning datasets, demonstrating its effectiveness in enhancing latent multi-hop reasoning ability. 
</p> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2502.10852" title="Abstract" id="2502.10852"> arXiv:2502.10852 </a> [<a href="/pdf/2502.10852" title="Download PDF" id="pdf-2502.10852" aria-labelledby="pdf-2502.10852">pdf</a>, <a href="https://arxiv.org/html/2502.10852v1" title="View HTML" id="html-2502.10852" aria-labelledby="html-2502.10852" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10852" title="Other formats" id="oth-2502.10852" aria-labelledby="oth-2502.10852">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multilingual Encoder Knows more than You Realize: Shared Weights Pretraining for Extremely Low-Resource Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Z">Zeli Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Ziyin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+G">Guixian Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jianing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">XU Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Ting Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Y">Yushuang Dong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> While multilingual language models like XLM-R have advanced multilingualism in NLP, they still perform poorly in extremely low-resource languages. This situation is exacerbated by the fact that modern LLMs such as LLaMA and Qwen support far fewer languages than XLM-R, making text generation models non-existent for many languages in the world. 
To tackle this challenge, we propose a novel framework for adapting multilingual encoders to text generation in extremely low-resource languages. By reusing the weights between the encoder and the decoder, our framework allows the model to leverage the learned semantic space of the encoder, enabling efficient learning and effective generalization in low-resource languages. Applying this framework to four Chinese minority languages, we present XLM-SWCM, and demonstrate its superior performance on various downstream tasks even when compared with much larger models. </p> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2502.10855" title="Abstract" id="2502.10855"> arXiv:2502.10855 </a> [<a href="/pdf/2502.10855" title="Download PDF" id="pdf-2502.10855" aria-labelledby="pdf-2502.10855">pdf</a>, <a href="/format/2502.10855" title="Other formats" id="oth-2502.10855" aria-labelledby="oth-2502.10855">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Effective Extraction and Evaluation of Factual Claims </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Metropolitansky,+D">Dasha Metropolitansky</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Larson,+J">Jonathan Larson</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A common strategy for fact-checking long-form content generated by Large Language Models (LLMs) is extracting simple claims that can be verified independently. Since inaccurate or incomplete claims compromise fact-checking results, ensuring claim quality is critical. However, the lack of a standardized evaluation framework impedes assessment and comparison of claim extraction methods. 
To address this gap, we propose a framework for evaluating claim extraction in the context of fact-checking along with automated, scalable, and replicable methods for applying this framework, including novel approaches for measuring coverage and decontextualization. We also introduce Claimify, an LLM-based claim extraction method, and demonstrate that it outperforms existing methods under our evaluation framework. A key feature of Claimify is its ability to handle ambiguity and extract claims only when there is high confidence in the correct interpretation of the source text. </p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2502.10857" title="Abstract" id="2502.10857"> arXiv:2502.10857 </a> [<a href="/pdf/2502.10857" title="Download PDF" id="pdf-2502.10857" aria-labelledby="pdf-2502.10857">pdf</a>, <a href="https://arxiv.org/html/2502.10857v1" title="View HTML" id="html-2502.10857" aria-labelledby="html-2502.10857" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10857" title="Other formats" id="oth-2502.10857" aria-labelledby="oth-2502.10857">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Divergent Thoughts toward One Goal: LLM-based Multi-Agent Collaboration System for Electronic Design Automation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Haoyuan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Haisheng Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zhuolun He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+B">Bei Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recently, with the development of tool-calling capabilities in large language models (LLMs), these models have demonstrated 
significant potential for automating electronic design automation (EDA) flows by interacting with EDA tool APIs via EDA scripts. However, considering the limited understanding of EDA tools, LLMs face challenges in practical scenarios where diverse interfaces of EDA tools exist across different platforms. Additionally, EDA flow automation often involves intricate, long-chain tool-calling processes, increasing the likelihood of errors in intermediate steps. Any errors will lead to the instability and failure of EDA flow automation. To address these challenges, we introduce EDAid, a multi-agent collaboration system where multiple agents harboring divergent thoughts converge towards a common goal, ensuring reliable and successful EDA flow automation. Specifically, each agent is controlled by ChipLlama models, which are expert LLMs fine-tuned for EDA flow automation. Our experiments demonstrate the state-of-the-art (SOTA) performance of our ChipLlama models and validate the effectiveness of our EDAid in the automation of complex EDA flows, showcasing superior performance compared to single-agent systems. 
</p> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2502.10868" title="Abstract" id="2502.10868"> arXiv:2502.10868 </a> [<a href="/pdf/2502.10868" title="Download PDF" id="pdf-2502.10868" aria-labelledby="pdf-2502.10868">pdf</a>, <a href="https://arxiv.org/html/2502.10868v1" title="View HTML" id="html-2502.10868" aria-labelledby="html-2502.10868" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10868" title="Other formats" id="oth-2502.10868" aria-labelledby="oth-2502.10868">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NitiBench: A Comprehensive Studies of LLM Frameworks Capabilities for Thai Legal Question Answering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Akarajaradwong,+P">Pawitsapak Akarajaradwong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pothavorn,+P">Pirat Pothavorn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chaksangchaichot,+C">Chompakorn Chaksangchaichot</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tasawong,+P">Panuthep Tasawong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nopparatbundit,+T">Thitiwat Nopparatbundit</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nutanong,+S">Sarana Nutanong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The application of large language models (LLMs) in the legal domain holds significant potential for information retrieval and question answering, yet Thai legal QA systems face challenges due to a lack of standardized evaluation benchmarks and the complexity of Thai legal structures. 
This paper introduces NitiBench, a benchmark comprising two datasets: the NitiBench-CCL, covering general Thai financial law, and the NitiBench-Tax, which includes real-world tax law cases requiring advanced legal reasoning. We evaluate retrieval-augmented generation (RAG) and long-context LLM-based approaches to address three key research questions: the impact of domain-specific components like section-based chunking and cross-referencing, the comparative performance of different retrievers and LLMs, and the viability of long-context LLMs as an alternative to RAG. Our results show that section-based chunking significantly improves retrieval and end-to-end performance, current retrievers struggle with complex queries, and long-context LLMs still underperform RAG-based systems in Thai legal QA. To support fair evaluation, we propose tailored multi-label retrieval metrics and an LLM-as-judge method for coverage and contradiction detection. These findings highlight the limitations of current Thai legal NLP solutions and provide a foundation for future research in the field. We have also open-sourced our code and dataset to make them publicly available. 
</p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2502.10871" title="Abstract" id="2502.10871"> arXiv:2502.10871 </a> [<a href="/pdf/2502.10871" title="Download PDF" id="pdf-2502.10871" aria-labelledby="pdf-2502.10871">pdf</a>, <a href="https://arxiv.org/html/2502.10871v1" title="View HTML" id="html-2502.10871" aria-labelledby="html-2502.10871" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10871" title="Other formats" id="oth-2502.10871" aria-labelledby="oth-2502.10871">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Representation and Recall of Interwoven Structured Knowledge in LLMs: A Geometric and Layered Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lei,+G">Ge Lei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cooper,+S+J">Samuel J. Cooper</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> This study investigates how large language models (LLMs) represent and recall multi-associated attributes across transformer layers. We show that intermediate layers encode factual knowledge by superimposing related attributes in overlapping spaces, along with effective recall even when attributes are not explicitly prompted. In contrast, later layers refine linguistic patterns and progressively separate attribute representations, optimizing task-specific outputs while appropriately narrowing attribute recall. We identify diverse encoding patterns including, for the first time, the observation of 3D spiral structures when exploring information related to the periodic table of elements. 
Our findings reveal a dynamic transition in attribute representations across layers, contributing to mechanistic interpretability and providing insights for understanding how LLMs handle complex, interrelated knowledge. </p> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2502.10881" title="Abstract" id="2502.10881"> arXiv:2502.10881 </a> [<a href="/pdf/2502.10881" title="Download PDF" id="pdf-2502.10881" aria-labelledby="pdf-2502.10881">pdf</a>, <a href="https://arxiv.org/html/2502.10881v1" title="View HTML" id="html-2502.10881" aria-labelledby="html-2502.10881" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10881" title="Other formats" id="oth-2502.10881" aria-labelledby="oth-2502.10881">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CiteCheck: Towards Accurate Citation Faithfulness Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Ziyao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+S">Shaohang Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+Z">Zhuoheng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+J">Jing Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zhe Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaoguang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+H">Haochen Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhijiang Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Houfeng Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Citation faithfulness detection is critical for enhancing retrieval-augmented generation (RAG) systems, yet 
large-scale Chinese datasets for this task are scarce. Existing methods face prohibitive costs due to the need for manually annotated negative samples. To address this, we introduce the first large-scale Chinese dataset CiteCheck for citation faithfulness detection, constructed via a cost-effective approach using two-stage manual annotation. This method balances positive and negative samples while significantly reducing annotation expenses. CiteCheck comprises training and test splits. Experiments demonstrate that: (1) the test samples are highly challenging, with even state-of-the-art LLMs failing to achieve high accuracy; and (2) training data augmented with LLM-generated negative samples enables smaller models to attain strong performance using parameter-efficient fine-tuning. CiteCheck provides a robust foundation for advancing citation faithfulness detection in Chinese RAG systems. The dataset is publicly available to facilitate research. </p> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2502.10886" title="Abstract" id="2502.10886"> arXiv:2502.10886 </a> [<a href="/pdf/2502.10886" title="Download PDF" id="pdf-2502.10886" aria-labelledby="pdf-2502.10886">pdf</a>, <a href="https://arxiv.org/html/2502.10886v1" title="View HTML" id="html-2502.10886" aria-labelledby="html-2502.10886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10886" title="Other formats" id="oth-2502.10886" aria-labelledby="oth-2502.10886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MET-Bench: Multimodal Entity Tracking for Evaluating the Limitations of Vision-Language and Reasoning Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cohen,+V">Vanya Cohen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mooney,+R">Raymond Mooney</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Entity tracking is a fundamental challenge in natural language understanding, requiring models to maintain coherent representations of entities. Previous work has benchmarked entity tracking performance in purely text-based tasks. We introduce MET-Bench, a multimodal entity tracking benchmark designed to evaluate the ability of vision-language models to track entity states across modalities. Using two structured domains, Chess and the Shell Game, we assess how effectively current models integrate textual and image-based state updates. Our findings reveal a significant performance gap between text-based and image-based tracking and that this performance gap stems from deficits in visual reasoning rather than perception. We further show that explicit text-based reasoning strategies improve performance, yet substantial limitations remain, especially in long-horizon multimodal scenarios. Our results highlight the need for improved multimodal representations and reasoning techniques to bridge the gap between textual and visual entity tracking. 
</p> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2502.10896" title="Abstract" id="2502.10896"> arXiv:2502.10896 </a> [<a href="/pdf/2502.10896" title="Download PDF" id="pdf-2502.10896" aria-labelledby="pdf-2502.10896">pdf</a>, <a href="/format/2502.10896" title="Other formats" id="oth-2502.10896" aria-labelledby="oth-2502.10896">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Developing Conversational Speech Systems for Robots to Detect Speech Biomarkers of Cognition in People Living with Dementia </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Perumandla,+R">Rohith Perumandla</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bae,+Y">Young-Ho Bae</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Izaguirre,+D">Diego Izaguirre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+E">Esther Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Murphy,+A">Andrew Murphy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+L">Long-Jing Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sabanovic,+S">Selma Sabanovic</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bennett,+C+C">Casey C. Bennett</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Main paper 28 pages long (pg 2-30), includes 5 figures, 5 tables, 1 Appendix at end </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study presents the development and testing of a conversational speech system designed for robots to detect speech biomarkers indicative of cognitive impairments in people living with dementia (PLwD). 
The system integrates a backend Python WebSocket server and a central core module with a large language model (LLM) fine-tuned for dementia to process user input and generate robotic conversation responses in real-time in less than 1.5 seconds. The frontend user interface, a Progressive Web App (PWA), displays information and biomarker score graphs on a smartphone in real-time to human users (PLwD, caregivers, clinicians). Six speech biomarkers based on the existing literature - Altered Grammar, Pragmatic Impairments, Anomia, Disrupted Turn-Taking, Slurred Pronunciation, and Prosody Changes - were developed for the robot conversation system using two datasets, one that included conversations of PLwD with a human clinician (DementiaBank dataset) and one that included conversations of PLwD with a robot (Indiana dataset). We also created a composite speech biomarker that combined all six individual biomarkers into a single score. The speech system's performance was first evaluated on the DementiaBank dataset showing moderate correlation with MMSE scores, with the composite biomarker score outperforming individual biomarkers. Analysis of the Indiana dataset revealed higher and more variable biomarker scores, suggesting potential differences due to study populations (e.g. severity of dementia) and the conversational scenario (human-robot conversations are different from human-human). The findings underscore the need for further research on the impact of conversational scenarios on speech biomarkers and the potential clinical applications of robotic speech systems. 
</p> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2502.10916" title="Abstract" id="2502.10916"> arXiv:2502.10916 </a> [<a href="/pdf/2502.10916" title="Download PDF" id="pdf-2502.10916" aria-labelledby="pdf-2502.10916">pdf</a>, <a href="https://arxiv.org/html/2502.10916v1" title="View HTML" id="html-2502.10916" aria-labelledby="html-2502.10916" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10916" title="Other formats" id="oth-2502.10916" aria-labelledby="oth-2502.10916">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Conversational Agents from Open-Source Large Language Models with Illocutionary Force and Document-Based Knowledge Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Inyama,+G">Godfrey Inyama</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages, 1 figure, 7 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> In this paper, we first present a novel way of computationally analysing and extracting illocutionary forces from dialogue using Bert-based Large Language Models, and demonstrate how these features impact the response of a conversational agent guided by a document-based knowledge bank demonstrated by a bespoke web conversational chat agent system developed. Our proposed illocutionary force extraction and classification technique is the first of its kind using the Argument Interchange Format (AIF) Dataset, showing an improved performance compared to two methods for carrying out similar tasks with a macro F1 of approximately 45%. 
When we evaluated the system based on 2 knowledge files, with 2 user queries each, across 5 open-source large language models (LLMs) using 10 standard metrics we found out that larger open-source models, such as Llama2:13b and Llama3-chatqa-latest, demonstrated an improved alignment when the user illocutionary force was included with their query, achieving higher QA and linguistic similarity scores. The smaller models on the other hand like Tinyllama:latest showed an increased perplexity and mixed performance, which explicitly indicated struggles in processing queries that explicitly included illocutionary forces. The results from the analysis highlight the potential of illocutionary force to enhance conversational depth while underscoring the need for model-specific optimizations to address increased computational costs and response times. </p> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2502.10921" title="Abstract" id="2502.10921"> arXiv:2502.10921 </a> [<a href="/pdf/2502.10921" title="Download PDF" id="pdf-2502.10921" aria-labelledby="pdf-2502.10921">pdf</a>, <a href="https://arxiv.org/html/2502.10921v1" title="View HTML" id="html-2502.10921" aria-labelledby="html-2502.10921" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10921" title="Other formats" id="oth-2502.10921" aria-labelledby="oth-2502.10921">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evolving Hate Speech Online: An Adaptive Framework for Detection and Mitigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ali,+S">Shiza Ali</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stringhini,+G">Gianluca Stringhini</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Social and Information Networks (cs.SI) </div> <p class='mathjax'> The 
proliferation of social media platforms has led to an increase in the spread of hate speech, particularly targeting vulnerable communities. Unfortunately, existing methods for automatically identifying and blocking toxic language rely on pre-constructed lexicons, making them reactive rather than adaptive. As such, these approaches become less effective over time, especially when new communities are targeted with slurs not included in the original datasets. To address this issue, we present an adaptive approach that uses word embeddings to update lexicons and develop a hybrid model that adjusts to emerging slurs and new linguistic patterns. This approach can effectively detect toxic language, including intentional spelling mistakes employed by aggressors to avoid detection. Our hybrid model, which combines BERT with lexicon-based techniques, achieves an accuracy of 95% for most state-of-the-art datasets. Our work has significant implications for creating safer online environments by improving the detection of toxic content and proactively updating the lexicon. Content Warning: This paper contains examples of hate speech that may be triggering. 
</p> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2502.10934" title="Abstract" id="2502.10934"> arXiv:2502.10934 </a> [<a href="/pdf/2502.10934" title="Download PDF" id="pdf-2502.10934" aria-labelledby="pdf-2502.10934">pdf</a>, <a href="/format/2502.10934" title="Other formats" id="oth-2502.10934" aria-labelledby="oth-2502.10934">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fundamental Principles of Linguistic Structure are Not Represented by o3 </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Murphy,+E">Elliot Murphy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leivada,+E">Evelina Leivada</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dentella,+V">Vittoria Dentella</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gunther,+F">Fritz Gunther</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Marcus,+G">Gary Marcus</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A core component of a successful artificial general intelligence would be the rapid creation and manipulation of grounded compositional abstractions and the demonstration of expertise in the family of recursive hierarchical syntactic objects necessary for the creative use of human language. 
We evaluated the recently released o3 model (OpenAI; o3-mini-high) and discovered that while it succeeds on some basic linguistic tests relying on linear, surface statistics (e.g., the Strawberry Test), it fails to generalize basic phrase structure rules; it fails with comparative sentences involving semantically illegal cardinality comparisons ('Escher sentences'); it fails to correctly rate and explain acceptability dynamics; and it fails to distinguish between instructions to generate unacceptable semantic vs. unacceptable syntactic outputs. When tasked with generating simple violations of grammatical rules, it is seemingly incapable of representing multiple parses to evaluate against various possible semantic interpretations. In stark contrast to many recent claims that artificial language models are on the verge of replacing the field of linguistics, our results suggest not only that deep learning is hitting a wall with respect to compositionality (Marcus 2022), but that it is hitting [a [stubbornly [resilient wall]]] that cannot readily be surmounted to reach human-like compositional reasoning simply through more compute. 
</p> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2502.10942" title="Abstract" id="2502.10942"> arXiv:2502.10942 </a> [<a href="/pdf/2502.10942" title="Download PDF" id="pdf-2502.10942" aria-labelledby="pdf-2502.10942">pdf</a>, <a href="https://arxiv.org/html/2502.10942v1" title="View HTML" id="html-2502.10942" aria-labelledby="html-2502.10942" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10942" title="Other formats" id="oth-2502.10942" aria-labelledby="oth-2502.10942">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Contextual Flux in Large Language Models: A Novel Approach to Self-Modulating Semantic Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Evidail,+H">Henry Evidail</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mountebank,+Z">Zachary Mountebank</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hathersage,+A">Alistair Hathersage</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stanhope,+P">Peter Stanhope</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ravenscroft,+B">Basil Ravenscroft</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Waddingham,+T">Tobias Waddingham</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Self-modulating mechanisms introduce dynamic adaptation capabilities within language models through contextual realignment strategies that influence token embedding trajectories across extended sequences. Contextual Flux is explored as an approach to embedding modulation, integrating an auxiliary gating mechanism within the self-attention framework to dynamically adjust token representations based on evolving contextual dependencies. 
The empirical analysis evaluates entropy variations, latent space realignments, and coherence stability to assess the extent to which self-regulation enhances text generation consistency while preserving generative flexibility. Quantitative assessments suggest that embedding shifts contribute to more structured adaptation in long-form sequences, with measured reductions in redundant phrase repetitions and improvements in thematic retention. Variability in contextual weight computation affects modulation stability, leading to differing levels of adaptation across diverse linguistic structures. The computational demands introduced through real-time embedding reconfiguration are examined in relation to model scalability, emphasizing the need for optimization strategies in high-volume generative applications. The findings suggest that while adaptive embedding updates improve certain aspects of coherence, their impact remains contingent on model capacity and input complexity. </p> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2502.10966" title="Abstract" id="2502.10966"> arXiv:2502.10966 </a> [<a href="/pdf/2502.10966" title="Download PDF" id="pdf-2502.10966" aria-labelledby="pdf-2502.10966">pdf</a>, <a href="https://arxiv.org/html/2502.10966v1" title="View HTML" id="html-2502.10966" aria-labelledby="html-2502.10966" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10966" title="Other formats" id="oth-2502.10966" aria-labelledby="oth-2502.10966">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neural Networks Remember More: The Power of Parameter Isolation and Combination </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+B">Biqing Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zehan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ayesh,+A">Aladdin Ayesh</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Catastrophic forgetting is a pervasive issue for pre-trained language models (PLMs) during continual learning, where models lose previously acquired knowledge when sequentially trained on a series of tasks. The model's ability to retain old tasks is referred to as stability, while its adaptability to new tasks is called plasticity. Therefore, the key to solving this problem is to find a trade-off between the plasticity and stability of the model. To address this issue, in this paper, we propose a novel method to achieve a balance between model stability and plasticity, thereby mitigating catastrophic forgetting. More specifically, our proposed approach leverages parameter isolation and a subsequent combination strategy. Initially, in the training stage, the model adapts to each downstream task via a parameter isolation method to prevent potential interference among different tasks. We then combine all trained parameters, which contain acquired knowledge, using the task arithmetic method and finally apply them to the backbone model. Empirical evaluations on continual language learning benchmarks substantiate the effectiveness of our approach, revealing a marked enhancement over existing state-of-the-art approaches. 
</p> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2502.10973" title="Abstract" id="2502.10973"> arXiv:2502.10973 </a> [<a href="/pdf/2502.10973" title="Download PDF" id="pdf-2502.10973" aria-labelledby="pdf-2502.10973">pdf</a>, <a href="https://arxiv.org/html/2502.10973v1" title="View HTML" id="html-2502.10973" aria-labelledby="html-2502.10973" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10973" title="Other formats" id="oth-2502.10973" aria-labelledby="oth-2502.10973">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Akan Cinematic Emotions (ACE): A Multimodal Multi-party Dataset for Emotion Recognition in Movie Dialogues </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sasu,+D">David Sasu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zehui Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Z">Ziwei Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Run Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+P">Pengyuan Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+L">Lin Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hirschberg,+J">Julia Hirschberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schluter,+N">Natalie Schluter</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this paper, we introduce the Akan Conversation Emotion (ACE) dataset, the first multimodal emotion dialogue dataset for an African language, addressing the significant lack of resources for low-resource languages in emotion recognition research. 
ACE, developed for the Akan language, contains 385 emotion-labeled dialogues and 6,162 utterances across audio, visual, and textual modalities, along with word-level prosodic prominence annotations. The presence of prosodic labels in this dataset also makes it the first prosodically annotated African language dataset. We demonstrate the quality and utility of ACE through experiments using state-of-the-art emotion recognition methods, establishing solid baselines for future research. We hope ACE inspires further work on inclusive, linguistically and culturally diverse NLP resources. </p> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2502.10990" title="Abstract" id="2502.10990"> arXiv:2502.10990 </a> [<a href="/pdf/2502.10990" title="Download PDF" id="pdf-2502.10990" aria-labelledby="pdf-2502.10990">pdf</a>, <a href="https://arxiv.org/html/2502.10990v1" title="View HTML" id="html-2502.10990" aria-labelledby="html-2502.10990" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10990" title="Other formats" id="oth-2502.10990" aria-labelledby="oth-2502.10990">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FinMTEB: Finance Massive Text Embedding Benchmark </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Y">Yixuan Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yi Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> <a href="https://github.com/yixuantt/FinMTEB" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Embedding models play a crucial role in representing and retrieving information across various NLP 
applications. Recent advances in large language models (LLMs) have further enhanced the performance of embedding models. While these models are often benchmarked on general-purpose datasets, real-world applications demand domain-specific evaluation. In this work, we introduce the Finance Massive Text Embedding Benchmark (FinMTEB), a specialized counterpart to MTEB designed for the financial domain. FinMTEB comprises 64 financial domain-specific embedding datasets across 7 tasks that cover diverse textual types in both Chinese and English, such as financial news articles, corporate annual reports, ESG reports, regulatory filings, and earnings call transcripts. We also develop a finance-adapted model, FinPersona-E5, using a persona-based data synthetic method to cover diverse financial embedding tasks for training. Through extensive evaluation of 15 embedding models, including FinPersona-E5, we show three key findings: (1) performance on general-purpose benchmarks shows limited correlation with financial domain tasks; (2) domain-adapted models consistently outperform their general-purpose counterparts; and (3) surprisingly, a simple Bag-of-Words (BoW) approach outperforms sophisticated dense embeddings in financial Semantic Textual Similarity (STS) tasks, underscoring current limitations in dense embedding techniques. Our work establishes a robust evaluation framework for financial NLP applications and provides crucial insights for developing domain-specific embedding models. 
</p> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2502.10993" title="Abstract" id="2502.10993"> arXiv:2502.10993 </a> [<a href="/pdf/2502.10993" title="Download PDF" id="pdf-2502.10993" aria-labelledby="pdf-2502.10993">pdf</a>, <a href="https://arxiv.org/html/2502.10993v1" title="View HTML" id="html-2502.10993" aria-labelledby="html-2502.10993" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10993" title="Other formats" id="oth-2502.10993" aria-labelledby="oth-2502.10993">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RoseRAG: Robust Retrieval-augmented Generation with Small-scale LLMs via Margin-aware Preference Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tianci Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+H">Haoxiang Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tianze Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Ran Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yue Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Linjun Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tuo Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoyu Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have achieved impressive performance but face high computational costs and latency, limiting their deployment in resource-constrained settings. In contrast, small-scale LLMs (SLMs) are more efficient yet struggle to capture evolving real-world knowledge. 
Retrieval-augmented generation (RAG) helps by integrating external knowledge, but imperfect retrieval can introduce distracting noise that misleads SLMs. We propose RoseRAG, a robust RAG framework for SLMs via Margin-aware Preference Optimization. RoseRAG employs multi-turn prompting for detailed reasoning, rejection sampling for high-quality explanations, and contrastive preference selection to refine responses by maximizing the likelihood gap between preferred and non-preferred outputs. By integrating these components into a margin-aware optimization process, RoseRAG robustly enhances the accuracy and reliability of SLMs for RAG applications. Extensive experiments on three open-domain question answering benchmarks indicate that our innovative RoseRAG surpasses state-of-the-art baselines significantly. </p> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2502.10995" title="Abstract" id="2502.10995"> arXiv:2502.10995 </a> [<a href="/pdf/2502.10995" title="Download PDF" id="pdf-2502.10995" aria-labelledby="pdf-2502.10995">pdf</a>, <a href="https://arxiv.org/html/2502.10995v1" title="View HTML" id="html-2502.10995" aria-labelledby="html-2502.10995" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10995" title="Other formats" id="oth-2502.10995" aria-labelledby="oth-2502.10995">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating Large language models on Understanding Korean indirect Speech acts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koo,+Y">Youngeun Koo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jiwoo Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+D">Dojun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+S">Seohyun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sungeun Lee</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review (15 pages) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> To accurately understand the intention of an utterance is crucial in conversational communication. As conversational artificial intelligence models are rapidly being developed and applied in various fields, it is important to evaluate the LLMs' capabilities of understanding the intentions of user's utterance. This study evaluates whether current LLMs can understand the intention of an utterance by considering the given conversational context, particularly in cases where the actual intention differs from the surface-leveled, literal intention of the sentence, i.e. indirect speech acts. Our findings reveal that Claude3-Opus outperformed the other competing models, with 71.94% in MCQ and 65% in OEQ, showing a clear advantage. In general, proprietary models exhibited relatively higher performance compared to open-source models. Nevertheless, no LLMs reached the level of human performance. Most LLMs, except for Claude3-Opus, demonstrated significantly lower performance in understanding indirect speech acts compared to direct speech acts, where the intention is explicitly revealed through the utterance. This study not only performs an overall pragmatic evaluation of each LLM's language use through the analysis of OEQ response patterns, but also emphasizes the necessity for further research to improve LLMs' understanding of indirect speech acts for more natural communication with humans. 
</p> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2502.10996" title="Abstract" id="2502.10996"> arXiv:2502.10996 </a> [<a href="/pdf/2502.10996" title="Download PDF" id="pdf-2502.10996" aria-labelledby="pdf-2502.10996">pdf</a>, <a href="https://arxiv.org/html/2502.10996v1" title="View HTML" id="html-2502.10996" aria-labelledby="html-2502.10996" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10996" title="Other formats" id="oth-2502.10996" aria-labelledby="oth-2502.10996">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RAS: Retrieval-And-Structuring for Knowledge-Intensive LLM Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+P">Pengcheng Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+L">Lang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+R">Ruike Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Minhao Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yunyi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jimeng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jiawei Han</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-augmented language models often struggle with knowledge-intensive tasks due to inefficient retrieval, unstructured knowledge integration, and single-pass architectures. We present Retrieval-And-Structuring (RAS), a novel framework that dynamically constructs and reasons over query-specific knowledge graphs through iterative retrieval and structuring. 
RAS introduces four key technical innovations: (1) a theme-scoped retrieval mechanism that efficiently narrows the search space while maintaining retrieval quality, (2) an action planning module that determines knowledge needs and generates focused sub-queries, (3) a dynamic knowledge structuring approach that converts retrieved text into an evolving knowledge graph, and (4) a graph-augmented answering component that leverages the accumulated structured information. Our framework achieves state-of-the-art performance, surpassing leading baselines by 6.4% with open-source language models and 7.0% with proprietary models on seven knowledge-intensive generation datasets across all evaluation metrics. Detailed ablation studies verify the contribution of each technical component to the overall system performance. </p> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2502.11008" title="Abstract" id="2502.11008"> arXiv:2502.11008 </a> [<a href="/pdf/2502.11008" title="Download PDF" id="pdf-2502.11008" aria-labelledby="pdf-2502.11008">pdf</a>, <a href="https://arxiv.org/html/2502.11008v1" title="View HTML" id="html-2502.11008" aria-labelledby="html-2502.11008" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11008" title="Other formats" id="oth-2502.11008" aria-labelledby="oth-2502.11008">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CounterBench: A Benchmark for Counterfactuals Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yuefei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=K.Singh,+V">Vivek K.Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruxiang Tang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Counterfactual reasoning is widely recognized as one of the most challenging and intricate aspects of causality in artificial intelligence. In this paper, we evaluate the performance of large language models (LLMs) in counterfactual reasoning. In contrast to previous studies that primarily focus on commonsense causal reasoning, where LLMs often rely on prior knowledge for inference, we specifically assess their ability to perform counterfactual inference using a set of formal rules. To support this evaluation, we introduce a new benchmark dataset, CounterBench, comprising 1K counterfactual reasoning questions. The dataset is designed with varying levels of difficulty, diverse causal graph structures, distinct types of counterfactual questions, and multiple nonsensical name variants. Our experiments demonstrate that counterfactual reasoning poses a significant challenge for LLMs, with most models performing at levels comparable to random guessing. To enhance LLM's counterfactual reasoning ability, we propose a novel reasoning paradigm, CoIn, which guides LLMs through iterative reasoning and backtracking to systematically explore counterfactual solutions. Experimental results show that our method significantly improves LLM performance on counterfactual reasoning tasks and consistently enhances performance across different LLMs. Our dataset is available at <a href="https://huggingface.co/datasets/CounterBench/CounterBench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2502.11018" title="Abstract" id="2502.11018"> arXiv:2502.11018 </a> [<a href="/pdf/2502.11018" title="Download PDF" id="pdf-2502.11018" aria-labelledby="pdf-2502.11018">pdf</a>, <a href="https://arxiv.org/html/2502.11018v1" title="View HTML" id="html-2502.11018" aria-labelledby="html-2502.11018" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11018" title="Other formats" id="oth-2502.11018" aria-labelledby="oth-2502.11018">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GRIFFIN: Effective Token Alignment for Faster Speculative Decoding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+S">Shijing Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jingyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+X">Xingyu Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Z">Zhihui Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toh,+K">Kim-Chuan Toh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Pan Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Speculative decoding accelerates inference in large language models (LLMs) by generating multiple draft tokens simultaneously. However, existing methods often struggle with token misalignment between the training and decoding phases, limiting their performance. To address this, we propose GRIFFIN, a novel framework that incorporates a token-alignable training strategy and a token-alignable draft model to mitigate misalignment. 
The training strategy employs a loss masking mechanism to exclude highly misaligned tokens during training, preventing them from negatively impacting the draft model's optimization. The token-alignable draft model introduces input tokens to correct inconsistencies in generated features. Experiments on LLaMA-series and Vicuna models demonstrate that GRIFFIN achieves an average acceptance length improvement of over 7% and a speedup ratio exceeding 8%, outperforming current SoTAs as shown in Fig. 1 (a) and (b). </p> </div> </dd> <dt> <a name='item41'>[41]</a> <a href ="/abs/2502.11020" title="Abstract" id="2502.11020"> arXiv:2502.11020 </a> [<a href="/pdf/2502.11020" title="Download PDF" id="pdf-2502.11020" aria-labelledby="pdf-2502.11020">pdf</a>, <a href="https://arxiv.org/html/2502.11020v1" title="View HTML" id="html-2502.11020" aria-labelledby="html-2502.11020" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11020" title="Other formats" id="oth-2502.11020" aria-labelledby="oth-2502.11020">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TUMLU: A Unified and Native Language Understanding Benchmark for Turkic Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Isbarov,+J">Jafar Isbarov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Akhundjanova,+A">Arofat Akhundjanova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hajili,+M">Mammad Hajili</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huseynova,+K">Kavsar Huseynova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gaynullin,+D">Dmitry Gaynullin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rzayev,+A">Anar Rzayev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tursun,+O">Osman Tursun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saetov,+I">Ilshat 
Saetov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kharisov,+R">Rinat Kharisov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belginova,+S">Saule Belginova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kenbayeva,+A">Ariana Kenbayeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alisheva,+A">Amina Alisheva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Turdubaeva,+A">Aizirek Turdubaeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=K%C3%B6ksal,+A">Abdullatif Köksal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rustamov,+S">Samir Rustamov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ataman,+D">Duygu Ataman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Being able to thoroughly assess massive multi-task language understanding (MMLU) capabilities is essential for advancing the applicability of multilingual language models. However, preparing such benchmarks in high quality native language is often costly and therefore limits the representativeness of evaluation datasets. While recent efforts focused on building more inclusive MMLU benchmarks, these are conventionally built using machine translation from high-resource languages, which may introduce errors and fail to account for the linguistic and cultural intricacies of the target languages. In this paper, we address the lack of native language MMLU benchmark especially in the under-represented Turkic language family with distinct morphosyntactic and cultural characteristics. We propose two benchmarks for Turkic language MMLU: TUMLU is a comprehensive, multilingual, and natively developed language understanding benchmark specifically designed for Turkic languages. 
It consists of middle- and high-school level questions spanning 11 academic subjects in Azerbaijani, Crimean Tatar, Karakalpak, Kazakh, Tatar, Turkish, Uyghur, and Uzbek. We also present TUMLU-mini, a more concise, balanced, and manually verified subset of the dataset. Using this dataset, we systematically evaluate a diverse range of open and proprietary multilingual large language models (LLMs), including Claude, Gemini, GPT, and LLaMA, offering an in-depth analysis of their performance across different languages, subjects, and alphabets. To promote further research and development in multilingual language understanding, we release TUMLU-mini and all corresponding evaluation scripts. </p> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2502.11022" title="Abstract" id="2502.11022"> arXiv:2502.11022 </a> [<a href="/pdf/2502.11022" title="Download PDF" id="pdf-2502.11022" aria-labelledby="pdf-2502.11022">pdf</a>, <a href="https://arxiv.org/html/2502.11022v1" title="View HTML" id="html-2502.11022" aria-labelledby="html-2502.11022" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11022" title="Other formats" id="oth-2502.11022" aria-labelledby="oth-2502.11022">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MultiTEND: A Multilingual Benchmark for Natural Language to NoSQL Query Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Z">Zhiqian Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuanfeng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Jinwei Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuanwei Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shuaimin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C+J">Chen Jason Zhang</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Natural language interfaces for NoSQL databases are increasingly vital in the big data era, enabling users to interact with complex, unstructured data without deep technical expertise. However, most recent advancements focus on English, leaving a gap for multilingual support. This paper introduces MultiTEND, the first and largest multilingual benchmark for natural language to NoSQL query generation, covering six languages: English, German, French, Russian, Japanese and Mandarin Chinese. Using MultiTEND, we analyze challenges in translating natural language to NoSQL queries across diverse linguistic structures, including lexical and syntactic differences. Experiments show that performance accuracy in both English and non-English settings remains relatively low, with a 4%-6% gap across scenarios like fine-tuned SLM, zero-shot LLM, and RAG for LLM. To address the aforementioned challenges, we introduce MultiLink, a novel framework that bridges the multilingual input to NoSQL query generation gap through a Parallel Linking Process. It breaks down the task into multiple steps, integrating parallel multilingual processing, Chain-of-Thought (CoT) reasoning, and Retrieval-Augmented Generation (RAG) to tackle lexical and structural challenges inherent in multilingual NoSQL generation. MultiLink shows enhancements in all metrics for every language against the top baseline, boosting execution accuracy by about 15% for English and averaging a 10% improvement for non-English languages. 
</p> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2502.11028" title="Abstract" id="2502.11028"> arXiv:2502.11028 </a> [<a href="/pdf/2502.11028" title="Download PDF" id="pdf-2502.11028" aria-labelledby="pdf-2502.11028">pdf</a>, <a href="https://arxiv.org/html/2502.11028v1" title="View HTML" id="html-2502.11028" aria-labelledby="html-2502.11028" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11028" title="Other formats" id="oth-2502.11028" aria-labelledby="oth-2502.11028">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mind the Confidence Gap: Overconfidence, Calibration, and Distractor Effects in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chhikara,+P">Prateek Chhikara</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) demonstrate impressive performance across diverse tasks, yet confidence calibration remains a challenge. Miscalibration - where models are overconfident or underconfident - poses risks, particularly in high-stakes applications. This paper presents an empirical study on LLM calibration, examining how model size, distractors, and question types affect confidence alignment. We introduce an evaluation framework to measure overconfidence and investigate whether multiple-choice formats mitigate or worsen miscalibration. Our findings show that while larger models (e.g., GPT-4o) are better calibrated overall, they are more prone to distraction, whereas smaller models benefit more from answer choices but struggle with uncertainty estimation. Unlike prior work, which primarily reports miscalibration trends, we provide actionable insights into failure modes and conditions that worsen overconfidence. 
These findings highlight the need for calibration-aware interventions and improved uncertainty estimation methods. </p> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2502.11051" title="Abstract" id="2502.11051"> arXiv:2502.11051 </a> [<a href="/pdf/2502.11051" title="Download PDF" id="pdf-2502.11051" aria-labelledby="pdf-2502.11051">pdf</a>, <a href="/format/2502.11051" title="Other formats" id="oth-2502.11051" aria-labelledby="oth-2502.11051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MMUNLEARNER: Reformulating Multimodal Machine Unlearning in the Era of Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+J">Jiahao Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yibo Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+X">Xu Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+Y">Yuanhuiyi Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+X">Xin Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zhihua Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xuming Hu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent progress in Machine Unlearning (MU) has introduced solutions for the selective removal of private or sensitive information encoded within deep neural networks. Nonetheless, MU for Multimodal Large Language Models (MLLMs) remains in its nascent phase. 
Therefore, we propose to reformulate the task of multimodal MU in the era of MLLMs, which aims to erase only the visual patterns associated with a given entity while preserving the corresponding textual knowledge encoded within the original parameters of the language model backbone. Furthermore, we develop a novel geometry-constrained gradient descent method MMUnlearner. It updates the weights of MLLMs with a weight saliency map jointly restricted by the remaining concepts and textual knowledge during unlearning, thereby preserving parameters essential for non-target knowledge. Extensive experiments demonstrate that MMUnlearner surpasses baselines that finetuning MLLMs with VQA data directly through Gradient Ascent (GA) or Negative Preference Optimization (NPO), across all evaluation dimensions. Our code will be released upon acceptance. </p> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2502.11054" title="Abstract" id="2502.11054"> arXiv:2502.11054 </a> [<a href="/pdf/2502.11054" title="Download PDF" id="pdf-2502.11054" aria-labelledby="pdf-2502.11054">pdf</a>, <a href="https://arxiv.org/html/2502.11054v1" title="View HTML" id="html-2502.11054" aria-labelledby="html-2502.11054" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11054" title="Other formats" id="oth-2502.11054" aria-labelledby="oth-2502.11054">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reasoning-Augmented Conversation for Multi-Turn Jailbreak Attacks on Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ying,+Z">Zonghao Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Deyue Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+Z">Zonglei Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yisong Xiao</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+Q">Quanchen Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aishan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+S">Siyuan Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiangzheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xianglong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+D">Dacheng Tao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR) </div> <p class='mathjax'> Multi-turn jailbreak attacks simulate real-world human interactions by engaging large language models (LLMs) in iterative dialogues, exposing critical safety vulnerabilities. However, existing methods often struggle to balance semantic coherence with attack effectiveness, resulting in either benign semantic drift or ineffective detection evasion. To address this challenge, we propose Reasoning-Augmented Conversation, a novel multi-turn jailbreak framework that reformulates harmful queries into benign reasoning tasks and leverages LLMs' strong reasoning capabilities to compromise safety alignment. Specifically, we introduce an attack state machine framework to systematically model problem translation and iterative reasoning, ensuring coherent query generation across multiple turns. Building on this framework, we design gain-guided exploration, self-play, and rejection feedback modules to preserve attack semantics, enhance effectiveness, and sustain reasoning-driven attack progression. Extensive experiments on multiple LLMs demonstrate that RACE achieves state-of-the-art attack effectiveness in complex conversational scenarios, with attack success rates (ASRs) increasing by up to 96%. 
Notably, our approach achieves ASRs of 82% and 92% against leading commercial models, OpenAI o1 and DeepSeek R1, underscoring its potency. We release our code at <a href="https://github.com/NY1024/RACE" rel="external noopener nofollow" class="link-external link-https">this https URL</a> to facilitate further research in this critical domain. </p> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2502.11061" title="Abstract" id="2502.11061"> arXiv:2502.11061 </a> [<a href="/pdf/2502.11061" title="Download PDF" id="pdf-2502.11061" aria-labelledby="pdf-2502.11061">pdf</a>, <a href="https://arxiv.org/html/2502.11061v1" title="View HTML" id="html-2502.11061" aria-labelledby="html-2502.11061" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11061" title="Other formats" id="oth-2502.11061" aria-labelledby="oth-2502.11061">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Déjà Vu? Decoding Repeated Reading from Eye Movements </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Meiri,+Y">Yoav Meiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shubi,+O">Omer Shubi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hadar,+C+A">Cfir Avraham Hadar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nitzav,+A+K">Ariel Kreisberg Nitzav</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berzak,+Y">Yevgeni Berzak</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Be it your favorite novel, a newswire article, a cooking recipe or an academic paper -- in many daily situations we read the same text more than once. 
In this work, we ask whether it is possible to automatically determine whether the reader has previously encountered a text based on their eye movement patterns. We introduce two variants of this task and address them with considerable success using both feature-based and neural models. We further introduce a general strategy for enhancing these models with machine generated simulations of eye movements from a cognitive model. Finally, we present an analysis of model performance which on the one hand yields insights on the information used by the models, and on the other hand leverages predictive modeling as an analytic tool for better characterization of the role of memory in repeated reading. Our work advances the understanding of the extent and manner in which eye movements in reading capture memory effects from prior text exposure, and paves the way for future applications that involve predictive modeling of repeated reading. </p> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2502.11062" title="Abstract" id="2502.11062"> arXiv:2502.11062 </a> [<a href="/pdf/2502.11062" title="Download PDF" id="pdf-2502.11062" aria-labelledby="pdf-2502.11062">pdf</a>, <a href="https://arxiv.org/html/2502.11062v1" title="View HTML" id="html-2502.11062" aria-labelledby="html-2502.11062" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11062" title="Other formats" id="oth-2502.11062" aria-labelledby="oth-2502.11062">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Similarity: A Gradient-based Graph Method for Instruction Tuning Data Selection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+L">Li Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+X">Xiao Ding</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ouyang,+Y">Yangou Ouyang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hepeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+K">Kai Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jinglong Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zhouhao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+D">Dongliang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qing,+Y">Yang Qing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dongchen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Ting Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have shown great potential across various industries due to their remarkable ability to generalize through instruction tuning. However, the limited availability of domain-specific data significantly hampers their performance on specialized tasks. While existing methods primarily focus on selecting training data from general datasets that are similar to the target domain, they often fail to consider the joint distribution of instructions, resulting in inefficient learning and suboptimal knowledge transfer. To address these challenges, we introduce G2IS (Gradient-based Graph Instruction Selection), a novel method that constructs a mixed gradient-based instruction graph to capture the joint distribution and interdependencies between instructions. By accounting for the relationships between instructions, G2IS improves domain adaptation efficiency. 
Additionally, we propose a gradient walk algorithm to refine the data selection process, enhancing both training effectiveness and efficiency. Our experiments demonstrate that G2IS outperforms traditional methods across various domain adaptation tasks, yielding significant performance gains, particularly in complex, data-scarce scenarios. These results underscore the potential of G2IS in advancing the development of large, domain-specific models. </p> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2502.11066" title="Abstract" id="2502.11066"> arXiv:2502.11066 </a> [<a href="/pdf/2502.11066" title="Download PDF" id="pdf-2502.11066" aria-labelledby="pdf-2502.11066">pdf</a>, <a href="https://arxiv.org/html/2502.11066v1" title="View HTML" id="html-2502.11066" aria-labelledby="html-2502.11066" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11066" title="Other formats" id="oth-2502.11066" aria-labelledby="oth-2502.11066">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CARMA: Enhanced Compositionality in LLMs via Advanced Regularisation and Mutual Information Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Aljaafari,+N">Nura Aljaafari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carvalho,+D+S">Danilo S. Carvalho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">André Freitas</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 7 figures, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) struggle with compositional generalisation, limiting their ability to systematically combine learned components to interpret novel inputs. 
While architectural modifications, fine-tuning, and data augmentation improve compositionality, they often have limited adaptability, face scalability constraints, or yield diminishing returns on real data. To address this, we propose CARMA, an intervention that enhances the stability and robustness of compositional reasoning in LLMs while preserving fine-tuned performance. CARMA employs mutual information regularisation and layer-wise stability constraints to mitigate feature fragmentation, ensuring structured representations persist across and within layers. We evaluate CARMA on inverse dictionary modelling and sentiment classification, measuring its impact on semantic consistency, performance stability, and robustness to lexical perturbations. Results show that CARMA reduces the variability introduced by fine-tuning, stabilises token representations, and improves compositional reasoning. While its effectiveness varies across architectures, CARMA's key strength lies in reinforcing learned structures rather than introducing new capabilities, making it a scalable auxiliary method. These findings suggest that integrating CARMA with fine-tuning can improve compositional generalisation while maintaining task-specific performance in LLMs. 
</p> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2502.11073" title="Abstract" id="2502.11073"> arXiv:2502.11073 </a> [<a href="/pdf/2502.11073" title="Download PDF" id="pdf-2502.11073" aria-labelledby="pdf-2502.11073">pdf</a>, <a href="https://arxiv.org/html/2502.11073v1" title="View HTML" id="html-2502.11073" aria-labelledby="html-2502.11073" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11073" title="Other formats" id="oth-2502.11073" aria-labelledby="oth-2502.11073">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Demystifying Hateful Content: Leveraging Large Multimodal Models for Hateful Meme Detection with Explainable Decisions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hee,+M+S">Ming Shan Hee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+R+K">Roy Ka-Wei Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint. Accepted at ICWSM'25 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Hateful meme detection presents a significant challenge as a multimodal task due to the complexity of interpreting implicit hate messages and contextual cues within memes. Previous approaches have fine-tuned pre-trained vision-language models (PT-VLMs), leveraging the knowledge they gained during pre-training and their attention mechanisms to understand meme content. However, the reliance of these models on implicit knowledge and complex attention mechanisms renders their decisions difficult to explain, which is crucial for building trust in meme classification. In this paper, we introduce IntMeme, a novel framework that leverages Large Multimodal Models (LMMs) for hateful meme classification with explainable decisions. 
IntMeme addresses the dual challenges of improving both accuracy and explainability in meme moderation. The framework uses LMMs to generate human-like, interpretive analyses of memes, providing deeper insights into multimodal content and context. Additionally, it uses independent encoding modules for both memes and their interpretations, which are then combined to enhance classification performance. Our approach addresses the opacity and misclassification issues associated with PT-VLMs, optimizing the use of LMMs for hateful meme detection. We demonstrate the effectiveness of IntMeme through comprehensive experiments across three datasets, showcasing its superiority over state-of-the-art models. </p> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2502.11075" title="Abstract" id="2502.11075"> arXiv:2502.11075 </a> [<a href="/pdf/2502.11075" title="Download PDF" id="pdf-2502.11075" aria-labelledby="pdf-2502.11075">pdf</a>, <a href="https://arxiv.org/html/2502.11075v1" title="View HTML" id="html-2502.11075" aria-labelledby="html-2502.11075" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11075" title="Other formats" id="oth-2502.11075" aria-labelledby="oth-2502.11075">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exposing Numeracy Gaps: A Benchmark to Evaluate Fundamental Numerical Abilities in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haoyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xuejia Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=XU,+Z">Zhanchao XU</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Darian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+N">Nicole Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Teng,+F">Fei Teng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+L">Luyu Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C+J">Chen Jason Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Lei Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated impressive capabilities in natural language processing tasks, such as text generation and semantic understanding. However, their performance on numerical reasoning tasks, such as basic arithmetic, numerical retrieval, and magnitude comparison, remains surprisingly poor. This gap arises from their reliance on surface-level statistical patterns rather than understanding numbers as continuous magnitudes. Existing benchmarks primarily focus on either linguistic competence or structured mathematical problem-solving, neglecting fundamental numerical reasoning required in real-world scenarios. To bridge this gap, we propose NumericBench, a comprehensive benchmark to evaluate six fundamental numerical capabilities: number recognition, arithmetic operations, contextual retrieval, comparison, summary, and logical reasoning. NumericBench includes datasets ranging from synthetic number lists to the crawled real-world data, addressing challenges like long contexts, noise, and multi-step reasoning. Extensive experiments on state-of-the-art LLMs, including GPT-4 and DeepSeek, reveal persistent weaknesses in numerical reasoning, highlighting the urgent need to improve numerically-aware language modeling. 
The benchmark is released in: <a href="https://github.com/TreeAI-Lab/NumericBench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item51'>[51]</a> <a href ="/abs/2502.11078" title="Abstract" id="2502.11078"> arXiv:2502.11078 </a> [<a href="/pdf/2502.11078" title="Download PDF" id="pdf-2502.11078" aria-labelledby="pdf-2502.11078">pdf</a>, <a href="https://arxiv.org/html/2502.11078v1" title="View HTML" id="html-2502.11078" aria-labelledby="html-2502.11078" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11078" title="Other formats" id="oth-2502.11078" aria-labelledby="oth-2502.11078">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DEEPER Insight into Your User: Directed Persona Refinement for Dynamic Persona Modeling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+A">Aili Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+C">Chengyu Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jiangjie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jinghan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yikai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+S">Siyu Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zulong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Liangyue Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yanghua Xiao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> To advance personalized applications such as recommendation systems and user behavior prediction, recent research increasingly adopts 
large language models (LLMs) for human-readable persona modeling. In dynamic real-world scenarios, effective persona modeling necessitates leveraging streaming behavior data to continually optimize user personas. However, existing methods — whether regenerating personas or incrementally extending them with new behaviors — often fail to achieve sustained improvements in persona quality or future behavior prediction accuracy. To address this, we propose DEEPER, a novel approach for dynamic persona modeling that enables continual persona optimization. Specifically, we enhance the model's direction-search capability through an iterative reinforcement learning framework, allowing it to automatically identify effective update directions and optimize personas using discrepancies between user behaviors and model predictions. Extensive experiments on dynamic persona modeling involving 4800 users across 10 domains highlight the superior persona optimization capabilities of DEEPER, delivering an impressive 32.2% average reduction in user behavior prediction error over four update rounds — outperforming the best baseline by a remarkable 22.92%. 
</p> </div> </dd> <dt> <a name='item52'>[52]</a> <a href ="/abs/2502.11083" title="Abstract" id="2502.11083"> arXiv:2502.11083 </a> [<a href="/pdf/2502.11083" title="Download PDF" id="pdf-2502.11083" aria-labelledby="pdf-2502.11083">pdf</a>, <a href="https://arxiv.org/html/2502.11083v1" title="View HTML" id="html-2502.11083" aria-labelledby="html-2502.11083" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11083" title="Other formats" id="oth-2502.11083" aria-labelledby="oth-2502.11083">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Streamlining the Collaborative Chain of Models into A Single Forward Pass in Generation-Based Tasks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+Y">Yuanjie Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yuhao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+T">Tong Xu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In Retrieval-Augmented Generation (RAG) and agent-based frameworks, the "Chain of Models" approach is widely used, where multiple specialized models work sequentially on distinct sub-tasks. This approach is effective but increases resource demands as each model must be deployed separately. Recent advancements attempt to address this by applying prompt tuning, which allows a shared base model to adapt to multiple tasks with minimal parameter changes. 
However, a key challenge remains: intermediate outputs, passed between models as plain text, require recomputation of hidden states (i.e., Key and Value (KV) states in Transformers) during inference. In this paper, we introduce FTHSS, a novel prompt-tuning method that enables models to share KV hidden states, eliminating redundant forward passes and reducing KV cache storage. By modifying input and attention masks during training, FTHSS allows models to effectively utilize KV hidden states from prior models in both single- and multi-round scenarios. Empirical results on four tasks show that FTHSS matches the performance of traditional model chains while improving inference efficiency. </p> </div> </dd> <dt> <a name='item53'>[53]</a> <a href ="/abs/2502.11084" title="Abstract" id="2502.11084"> arXiv:2502.11084 </a> [<a href="/pdf/2502.11084" title="Download PDF" id="pdf-2502.11084" aria-labelledby="pdf-2502.11084">pdf</a>, <a href="https://arxiv.org/html/2502.11084v1" title="View HTML" id="html-2502.11084" aria-labelledby="html-2502.11084" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11084" title="Other formats" id="oth-2502.11084" aria-labelledby="oth-2502.11084">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Rewrite to Jailbreak: Discover Learnable and Transferable Implicit Harmfulness Instruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yuting Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chengyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+Y">Yifeng Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+F">Fei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuang,+K">Kun Kuang</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> 21 pages, 10 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As Large Language Models (LLMs) are widely applied in various domains, the safety of LLMs is increasingly attracting attention to avoid their powerful capabilities being misused. Existing jailbreak methods create a forced instruction-following scenario, or search adversarial prompts with prefix or suffix tokens to achieve a specific representation manually or automatically. However, they suffer from low efficiency and explicit jailbreak patterns, far from the real deployment of mass attacks to LLMs. In this paper, we point out that simply rewriting the original instruction can achieve a jailbreak, and we find that this rewriting approach is learnable and transferable. We propose the Rewrite to Jailbreak (R2J) approach, a transferable black-box jailbreak method to attack LLMs by iteratively exploring the weakness of the LLMs and automatically improving the attacking strategy. The jailbreak is more efficient and hard to identify since no additional features are introduced. Extensive experiments and analysis demonstrate the effectiveness of R2J, and we find that the jailbreak is also transferable to multiple datasets and various types of models with only a few queries. We hope our work motivates further investigation of LLM safety. 
</p> </div> </dd> <dt> <a name='item54'>[54]</a> <a href ="/abs/2502.11089" title="Abstract" id="2502.11089"> arXiv:2502.11089 </a> [<a href="/pdf/2502.11089" title="Download PDF" id="pdf-2502.11089" aria-labelledby="pdf-2502.11089">pdf</a>, <a href="https://arxiv.org/html/2502.11089v1" title="View HTML" id="html-2502.11089" aria-labelledby="html-2502.11089" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11089" title="Other formats" id="oth-2502.11089" aria-labelledby="oth-2502.11089">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jingyang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Huazuo Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+D">Damai Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Junyu Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhengyan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhenda Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y+X">Y. X. 
Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lean Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Z">Zhiping Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuqing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+C">Chong Ruan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Ming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+W">Wenfeng Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+W">Wangding Zeng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with hardware-aligned optimizations to achieve efficient long-context modeling. NSA employs a dynamic hierarchical sparse strategy, combining coarse-grained token compression with fine-grained token selection to preserve both global context awareness and local precision. Our approach advances sparse attention design with two key innovations: (1) We achieve substantial speedups through arithmetic intensity-balanced algorithm design, with implementation optimizations for modern hardware. (2) We enable end-to-end training, reducing pretraining computation without sacrificing model performance. 
As shown in Figure 1, experiments show the model pretrained with NSA maintains or exceeds Full Attention models across general benchmarks, long-context tasks, and instruction-based reasoning. Meanwhile, NSA achieves substantial speedups over Full Attention on 64k-length sequences across decoding, forward propagation, and backward propagation, validating its efficiency throughout the model lifecycle. </p> </div> </dd> <dt> <a name='item55'>[55]</a> <a href ="/abs/2502.11090" title="Abstract" id="2502.11090"> arXiv:2502.11090 </a> [<a href="/pdf/2502.11090" title="Download PDF" id="pdf-2502.11090" aria-labelledby="pdf-2502.11090">pdf</a>, <a href="https://arxiv.org/html/2502.11090v1" title="View HTML" id="html-2502.11090" aria-labelledby="html-2502.11090" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11090" title="Other formats" id="oth-2502.11090" aria-labelledby="oth-2502.11090">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SafeDialBench: A Fine-Grained Safety Benchmark for Large Language Models in Multi-Turn Dialogues with Diverse Jailbreak Attacks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+H">Hongye Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yanming Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+S">Sijia Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+Z">Ziyue Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Z">Zhixin Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Z">Zhe Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+M">Meng Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+F">Fan Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Boyan Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+T">Tianpei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+J">Jing Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+F">Fanyu Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+C">Chao Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Junlan Feng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> With the rapid advancement of Large Language Models (LLMs), the safety of LLMs has been a critical concern requiring precise assessment. Current benchmarks primarily concentrate on single-turn dialogues or a single jailbreak attack method to assess the safety. Additionally, these benchmarks have not taken into account the LLM's capability of identifying and handling unsafe information in detail. To address these issues, we propose a fine-grained benchmark SafeDialBench for evaluating the safety of LLMs across various jailbreak attacks in multi-turn dialogues. Specifically, we design a two-tier hierarchical safety taxonomy that considers 6 safety dimensions and generates more than 4000 multi-turn dialogues in both Chinese and English under 22 dialogue scenarios. We employ 7 jailbreak attack strategies, such as reference attack and purpose reverse, to enhance the dataset quality for dialogue generation. Notably, we construct an innovative assessment framework of LLMs, measuring capabilities in detecting, and handling unsafe information and maintaining consistency when facing jailbreak attacks. 
Experimental results across 17 LLMs reveal that Yi-34B-Chat and GLM4-9B-Chat demonstrate superior safety performance, while Llama3.1-8B-Instruct and o3-mini exhibit safety vulnerabilities. </p> </div> </dd> <dt> <a name='item56'>[56]</a> <a href ="/abs/2502.11095" title="Abstract" id="2502.11095"> arXiv:2502.11095 </a> [<a href="/pdf/2502.11095" title="Download PDF" id="pdf-2502.11095" aria-labelledby="pdf-2502.11095">pdf</a>, <a href="https://arxiv.org/html/2502.11095v1" title="View HTML" id="html-2502.11095" aria-labelledby="html-2502.11095" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11095" title="Other formats" id="oth-2502.11095" aria-labelledby="oth-2502.11095">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey of Large Language Models in Psychotherapy: Current Landscape and Future Directions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Na,+H">Hongbin Na</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yining Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zimu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+T">Tao Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+B">Beibei Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lilin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Torous,+J">John Torous</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Ling Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Mental health remains a 
critical global challenge, with increasing demand for accessible, effective interventions. Large language models (LLMs) offer promising solutions in psychotherapy by enhancing the assessment, diagnosis, and treatment of mental health conditions through dynamic, context-aware interactions. This survey provides a comprehensive overview of the current landscape of LLM applications in psychotherapy, highlighting the roles of LLMs in symptom detection, severity estimation, cognitive assessment, and therapeutic interventions. We present a novel conceptual taxonomy to organize the psychotherapy process into three core components: assessment, diagnosis, and treatment, and examine the challenges and advancements in each area. The survey also addresses key research gaps, including linguistic biases, limited disorder coverage, and underrepresented therapeutic models. Finally, we discuss future directions to integrate LLMs into a holistic, end-to-end psychotherapy framework, addressing the evolving nature of mental health conditions and fostering more inclusive, personalized care. 
</p> </div> </dd> <dt> <a name='item57'>[57]</a> <a href ="/abs/2502.11100" title="Abstract" id="2502.11100"> arXiv:2502.11100 </a> [<a href="/pdf/2502.11100" title="Download PDF" id="pdf-2502.11100" aria-labelledby="pdf-2502.11100">pdf</a>, <a href="https://arxiv.org/html/2502.11100v1" title="View HTML" id="html-2502.11100" aria-labelledby="html-2502.11100" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11100" title="Other formats" id="oth-2502.11100" aria-labelledby="oth-2502.11100">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Achieving Concept Completeness for Unsupervised Textual Concept Bottleneck Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhan,+M">Milan Bhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choho,+Y">Yann Choho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Moreau,+P">Pierre Moreau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vittaut,+J">Jean-Noel Vittaut</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chesneau,+N">Nicolas Chesneau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lesot,+M">Marie-Jeanne Lesot</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Textual Concept Bottleneck Models (TBMs) are interpretable-by-design models for text classification that predict a set of salient concepts before making the final prediction. This paper proposes Complete Textual Concept Bottleneck Model (CT-CBM),a novel TCBM generator building concept labels in a fully unsupervised manner using a small language model, eliminating both the need for predefined human labeled concepts and LLM annotations. 
CT-CBM iteratively targets and adds important concepts in the bottleneck layer to create a complete concept basis and addresses downstream classification leakage through a parallel residual connection. CT-CBM achieves good results against competitors, offering a promising solution to enhance interpretability of NLP classifiers without sacrificing performance. </p> </div> </dd> <dt> <a name='item58'>[58]</a> <a href ="/abs/2502.11101" title="Abstract" id="2502.11101"> arXiv:2502.11101 </a> [<a href="/pdf/2502.11101" title="Download PDF" id="pdf-2502.11101" aria-labelledby="pdf-2502.11101">pdf</a>, <a href="https://arxiv.org/html/2502.11101v1" title="View HTML" id="html-2502.11101" aria-labelledby="html-2502.11101" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11101" title="Other formats" id="oth-2502.11101" aria-labelledby="oth-2502.11101">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CacheFocus: Dynamic Cache Re-Positioning for Efficient Retrieval-Augmented Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+K">Kun-Hui Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+E">Eunhwan Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+D">Donghoon Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Na,+S">Seung-Hoon Na</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages (Work in progress) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) excel across a variety of language tasks yet are constrained by limited input lengths and high computational costs. 
Existing approaches—such as relative positional encodings (e.g., RoPE, ALiBi) and sliding window mechanisms—partially alleviate these issues but often require additional training or suffer from performance degradation with longer inputs. In this paper, we introduce \textbf{\textit{CacheFocus}}, a method that enhances length normalization and reduces inference latency without any further training. Our approach leverages query-independent, offline caching to efficiently reuse a Context KV Cache Store. We address the amplification of abnormal token distributions problem by re-positioning cached keys and introducing Layer-Adaptive Cache Pruning to discard low-relevance caches during pre-filling. Additionally, our Adaptive Positional Allocation Strategy dynamically reassigns cache positions to maximize the use of the available positional encoding range. Experiments on the Natural Questions and TriviaQA datasets demonstrate that CacheFocus outperforms alternative methods even when inputs exceed the $4$K limit of the \texttt{LLaMA-2} model, emphasizing its practical effectiveness for long-context LLMs. Moreover, even with large maximum input length of \texttt{Qwen2}, the performance of CacheFocus shows that it maintains consistent performance even as the number of documents increases, effectively managing long-text generation without degradation. 
</p> </div> </dd> <dt> <a name='item59'>[59]</a> <a href ="/abs/2502.11104" title="Abstract" id="2502.11104"> arXiv:2502.11104 </a> [<a href="/pdf/2502.11104" title="Download PDF" id="pdf-2502.11104" aria-labelledby="pdf-2502.11104">pdf</a>, <a href="https://arxiv.org/html/2502.11104v1" title="View HTML" id="html-2502.11104" aria-labelledby="html-2502.11104" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11104" title="Other formats" id="oth-2502.11104" aria-labelledby="oth-2502.11104">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Cross-Tokenizer Knowledge Distillation with Contextual Dynamical Mapping </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yijie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yijin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+F">Fandong Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yufeng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jinan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jie Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The code is available at <a href="https://github.com/pppa2019/ContexualDynamicMapping" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Knowledge Distillation (KD) has emerged as a prominent technique for model compression. However, conventional KD approaches primarily focus on homogeneous architectures with identical tokenizers, constraining their applicability in cross-architecture scenarios. 
As for the cross-tokenizer KD, the differences in the tokenizers give rise to two fundamental challenges: (1) sequence misalignment caused by divergent tokenization strategies, and (2) mismatched vocabulary size and composition. While existing probability-matching methods attempt to address these issues, their efficacy remains limited due to suboptimal alignment in both the sequence and vocabulary aspects. To overcome these limitations, we propose Contextual Dynamic Mapping (CDM), a novel cross-tokenizer distillation framework that employs contextual information to enhance sequence alignment precision and dynamically improves vocabulary mapping. We evaluated the effectiveness of our approach across five advanced and widely-used model families (i.e, LLama3, Phi3, Gemma2, OPT and Qwen2), which were configured into three distinct teacher-student pairs. Our method shows significant advantages over existing cross-tokenizer distillation baselines across diverse benchmarks, including instruction-following, code generation and math. Notably, our analysis reveals that combining conventional same-tokenizer distillation and cross-tokenizer distillation through CDM yields further performance improvements. 
The code is available at <a href="https://github.com/pppa2019/ContexualDynamicMapping" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item60'>[60]</a> <a href ="/abs/2502.11108" title="Abstract" id="2502.11108"> arXiv:2502.11108 </a> [<a href="/pdf/2502.11108" title="Download PDF" id="pdf-2502.11108" aria-labelledby="pdf-2502.11108">pdf</a>, <a href="https://arxiv.org/html/2502.11108v1" title="View HTML" id="html-2502.11108" aria-labelledby="html-2502.11108" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11108" title="Other formats" id="oth-2502.11108" aria-labelledby="oth-2502.11108">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Knowledge Graph-Driven Retrieval-Augmented Generation: Integrating Deepseek-R1 with Weaviate for Advanced Chatbot Applications </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lecu,+A">Alexandru Lecu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Groza,+A">Adrian Groza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hawizy,+L">Lezan Hawizy</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have significantly advanced the field of natural language generation. However, they frequently generate unverified outputs, which compromises their reliability in critical applications. In this study, we propose an innovative framework that combines structured biomedical knowledge with LLMs through a retrieval-augmented generation technique. Our system develops a thorough knowledge graph by identifying and refining causal relationships and named entities from medical abstracts related to age-related macular degeneration (AMD). 
Using a vector-based retrieval process and a locally deployed language model, our framework produces responses that are both contextually relevant and verifiable, with direct references to clinical evidence. Experimental results show that this method notably decreases hallucinations, enhances factual precision, and improves the clarity of generated responses, providing a robust solution for advanced biomedical chatbot applications. </p> </div> </dd> <dt> <a name='item61'>[61]</a> <a href ="/abs/2502.11113" title="Abstract" id="2502.11113"> arXiv:2502.11113 </a> [<a href="/pdf/2502.11113" title="Download PDF" id="pdf-2502.11113" aria-labelledby="pdf-2502.11113">pdf</a>, <a href="https://arxiv.org/html/2502.11113v1" title="View HTML" id="html-2502.11113" aria-labelledby="html-2502.11113" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11113" title="Other formats" id="oth-2502.11113" aria-labelledby="oth-2502.11113">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Valuable Hallucinations: Realizable Non-realistic Propositions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qiucheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper introduces the first formal definition of valuable hallucinations in large language models (LLMs), addressing a gap in the existing literature. We provide a systematic definition and analysis of hallucination value, proposing methods for enhancing the value of hallucinations. 
In contrast to previous works, which often treat hallucinations as a broad flaw, we focus on the potential value that certain types of hallucinations can offer in specific contexts. Hallucinations in LLMs generally refer to the generation of unfaithful, fabricated, inconsistent, or nonsensical content. Rather than viewing all hallucinations negatively, this paper gives formal representations and manual judgments of "valuable hallucinations" and explores how realizable non-realistic propositions—ideas that are not currently true but could be achievable under certain conditions—can have constructive value. We present experiments using the Qwen2.5 model and HalluQA dataset, employing ReAct prompting (which involves reasoning, confidence assessment, and answer verification) to control and optimize hallucinations. Our findings show that ReAct prompting results in a reduction in overall hallucinations and an increase in the proportion of valuable hallucinations. These results demonstrate that systematically controlling hallucinations can improve their usefulness without compromising factual reliability. 
</p> </div> </dd> <dt> <a name='item62'>[62]</a> <a href ="/abs/2502.11114" title="Abstract" id="2502.11114"> arXiv:2502.11114 </a> [<a href="/pdf/2502.11114" title="Download PDF" id="pdf-2502.11114" aria-labelledby="pdf-2502.11114">pdf</a>, <a href="https://arxiv.org/html/2502.11114v1" title="View HTML" id="html-2502.11114" aria-labelledby="html-2502.11114" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11114" title="Other formats" id="oth-2502.11114" aria-labelledby="oth-2502.11114">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Pairwise: Global Zero-shot Temporal Graph Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Eirew,+A">Alon Eirew</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bar,+K">Kfir Bar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dagan,+I">Ido Dagan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Temporal relation extraction (TRE) is a fundamental task in natural language processing (NLP) that involves identifying the temporal relationships between events in a document. Despite the advances in large language models (LLMs), their application to TRE remains limited. Most existing approaches rely on pairwise classification, in which event pairs are considered individually, leading to computational inefficiency and a lack of global consistency in the resulting temporal graph. In this work, we propose a novel zero-shot method for TRE that generates a document's complete temporal graph at once, then applies transitive constraints optimization to refine predictions and enforce temporal consistency across relations. 
Additionally, we introduce OmniTemp, a new dataset with complete annotations for all pairs of targeted events within a document. Through experiments and analyses, we demonstrate that our method significantly outperforms existing zero-shot approaches while achieving competitive performance with supervised models. </p> </div> </dd> <dt> <a name='item63'>[63]</a> <a href ="/abs/2502.11115" title="Abstract" id="2502.11115"> arXiv:2502.11115 </a> [<a href="/pdf/2502.11115" title="Download PDF" id="pdf-2502.11115" aria-labelledby="pdf-2502.11115">pdf</a>, <a href="https://arxiv.org/html/2502.11115v1" title="View HTML" id="html-2502.11115" aria-labelledby="html-2502.11115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11115" title="Other formats" id="oth-2502.11115" aria-labelledby="oth-2502.11115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Are Generative Models Underconfident? An Embarrassingly Simple Quality Estimation Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dinh,+T+A">Tu Anh Dinh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niehues,+J">Jan Niehues</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Quality Estimation (QE) is estimating the quality of model output when the ground truth reference is not available. Looking at model uncertainty from its own output probabilities is the most trivial and low-effort way to estimate the output quality. However, for generative model, output probabilities might not be the best quality estimator. At an output step, there can be multiple correct options, making the probability distribution spread out more. Thus, lower token probability does not necessarily mean lower output quality. 
In other words, the model can be considered underconfident. In this paper, we propose a QE approach called Dominant Mass Probability (DMP), that boosts the model confidence in cases where there are multiple viable output options. We show that, with no increase in complexity, DMP is notably better than sequence probability when estimating the quality of different models (Whisper, Llama, etc.) on different tasks (translation, summarization, etc.). Compared to sequence probability, DMP achieves on average +0.208 improvement in Pearson correlation to ground-truth quality. </p> </div> </dd> <dt> <a name='item64'>[64]</a> <a href ="/abs/2502.11116" title="Abstract" id="2502.11116"> arXiv:2502.11116 </a> [<a href="/pdf/2502.11116" title="Download PDF" id="pdf-2502.11116" aria-labelledby="pdf-2502.11116">pdf</a>, <a href="https://arxiv.org/html/2502.11116v1" title="View HTML" id="html-2502.11116" aria-labelledby="html-2502.11116" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11116" title="Other formats" id="oth-2502.11116" aria-labelledby="oth-2502.11116">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Gumbel Reranking: Differentiable End-to-End Reranker Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Siyuan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zhiyuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+J">Jintao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+C">Changhua Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weiqiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leng,+J">Jingwen Leng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+M">Minyi Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Z">Zhouhan Lin</a></div> 
<div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> RAG systems rely on rerankers to identify relevant documents. However, fine-tuning these models remains challenging due to the scarcity of annotated query-document pairs. Existing distillation-based approaches suffer from training-inference misalignment and fail to capture interdependencies among candidate documents. To overcome these limitations, we reframe the reranking process as an attention-mask problem and propose Gumbel Reranking, an end-to-end training framework for rerankers aimed at minimizing the training-inference gap. In our approach, reranker optimization is reformulated as learning a stochastic, document-wise Top-$k$ attention mask using the Gumbel Trick and Relaxed Top-$k$ Sampling. This formulation enables end-to-end optimization by minimizing the overall language loss. Experiments across various settings consistently demonstrate performance gains, including a 10.4\% improvement in recall on HotpotQA for distinguishing indirectly relevant documents. 
</p> </div> </dd> <dt> <a name='item65'>[65]</a> <a href ="/abs/2502.11123" title="Abstract" id="2502.11123"> arXiv:2502.11123 </a> [<a href="/pdf/2502.11123" title="Download PDF" id="pdf-2502.11123" aria-labelledby="pdf-2502.11123">pdf</a>, <a href="https://arxiv.org/html/2502.11123v1" title="View HTML" id="html-2502.11123" aria-labelledby="html-2502.11123" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11123" title="Other formats" id="oth-2502.11123" aria-labelledby="oth-2502.11123">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DuplexMamba: Enhancing Real-time Speech Conversations with Duplex and Streaming Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xiangyu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Hongyun Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haiyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+C">Conghui Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tiejun Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Muyun Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Real-time speech conversation is essential for natural and efficient human-machine interactions, requiring duplex and streaming capabilities. Traditional Transformer-based conversational chatbots operate in a turn-based manner and exhibit quadratic computational complexity that grows as the input size increases. 
In this paper, we propose DuplexMamba, a Mamba-based end-to-end multimodal duplex model for speech-to-text conversation. DuplexMamba enables simultaneous input processing and output generation, dynamically adjusting to support real-time streaming. Specifically, we develop a Mamba-based speech encoder and adapt it with a Mamba-based language model. Furthermore, we introduce a novel duplex decoding strategy that enables DuplexMamba to process input and generate output simultaneously. Experimental results demonstrate that DuplexMamba successfully implements duplex and streaming capabilities while achieving performance comparable to several recently developed Transformer-based models in automatic speech recognition (ASR) tasks and voice assistant benchmark evaluations. </p> </div> </dd> <dt> <a name='item66'>[66]</a> <a href ="/abs/2502.11128" title="Abstract" id="2502.11128"> arXiv:2502.11128 </a> [<a href="/pdf/2502.11128" title="Download PDF" id="pdf-2502.11128" aria-labelledby="pdf-2502.11128">pdf</a>, <a href="https://arxiv.org/html/2502.11128v1" title="View HTML" id="html-2502.11128" aria-labelledby="html-2502.11128" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11128" title="Other formats" id="oth-2502.11128" aria-labelledby="oth-2502.11128">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FELLE: Autoregressive Speech Synthesis with Token-Wise Coarse-to-Fine Flow Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shujie Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+L">Lingwei Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jinyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yifan Yang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shiwan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Haiyang Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yanqing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Haoqin Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jiaming Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Y">Yong Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> To advance continuous-valued token modeling and temporal-coherence enforcement, we propose FELLE, an autoregressive model that integrates language modeling with token-wise flow matching. By leveraging the autoregressive nature of language models and the generative efficacy of flow matching, FELLE effectively predicts continuous-valued tokens (mel-spectrograms). For each continuous-valued token, FELLE modifies the general prior distribution in flow matching by incorporating information from the previous step, improving coherence and stability. Furthermore, to enhance synthesis quality, FELLE introduces a coarse-to-fine flow-matching mechanism, generating continuous-valued tokens hierarchically, conditioned on the language model's output. Experimental results demonstrate the potential of incorporating flow-matching techniques in autoregressive mel-spectrogram modeling, leading to significant improvements in TTS generation quality, as shown in <a href="https://aka.ms/felle" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item67'>[67]</a> <a href ="/abs/2502.11131" title="Abstract" id="2502.11131"> arXiv:2502.11131 </a> [<a href="/pdf/2502.11131" title="Download PDF" id="pdf-2502.11131" aria-labelledby="pdf-2502.11131">pdf</a>, <a href="https://arxiv.org/html/2502.11131v1" title="View HTML" id="html-2502.11131" aria-labelledby="html-2502.11131" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11131" title="Other formats" id="oth-2502.11131" aria-labelledby="oth-2502.11131">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Similar Case Retrieval Ranking Performance By Revisiting RankSVM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Y">Yan Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Given the rapid development of Legal AI, a lot of attention has been paid to one of the most important legal AI tasks--similar case retrieval, especially with language models to use. In our paper, however, we try to improve the ranking performance of current models from the perspective of learning to rank instead of language models. Specifically, we conduct experiments using a pairwise method--RankSVM as the classifier to substitute a fully connected layer, combined with commonly used language models on similar case retrieval datasets LeCaRDv1 and LeCaRDv2. We finally come to the conclusion that RankSVM could generally help improve the retrieval performance on the LeCaRDv1 and LeCaRDv2 datasets compared with original classifiers by optimizing the precise ranking. It could also help mitigate overfitting owing to class imbalance. 
Our code is available in <a href="https://github.com/liuyuqi123study/RankSVM_for_SLR" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item68'>[68]</a> <a href ="/abs/2502.11137" title="Abstract" id="2502.11137"> arXiv:2502.11137 </a> [<a href="/pdf/2502.11137" title="Download PDF" id="pdf-2502.11137" aria-labelledby="pdf-2502.11137">pdf</a>, <a href="https://arxiv.org/html/2502.11137v1" title="View HTML" id="html-2502.11137" aria-labelledby="html-2502.11137" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11137" title="Other formats" id="oth-2502.11137" aria-labelledby="oth-2502.11137">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Safety Evaluation of DeepSeek Models in Chinese Contexts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenjing Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lei,+X">Xuejiao Lei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhaoxiang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Ning Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Long,+Z">Zhenhong Long</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+P">Peijun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jiaojiao Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+M">Minjie Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+C">Chaoyang Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Kai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lian,+S">Shiguo Lian</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; 
Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recently, the DeepSeek series of models, leveraging their exceptional reasoning capabilities and open-source strategy, is reshaping the global AI landscape. Despite these advantages, they exhibit significant safety deficiencies. Research conducted by Robust Intelligence, a subsidiary of Cisco, in collaboration with the University of Pennsylvania, revealed that DeepSeek-R1 has a 100\% attack success rate when processing harmful prompts. Additionally, multiple safety companies and research institutions have confirmed critical safety vulnerabilities in this model. As models demonstrating robust performance in Chinese and English, DeepSeek models require equally crucial safety assessments in both language contexts. However, current research has predominantly focused on safety evaluations in English environments, leaving a gap in comprehensive assessments of their safety performance in Chinese contexts. In response to this gap, this study introduces CHiSafetyBench, a Chinese-specific safety evaluation benchmark. This benchmark systematically evaluates the safety of DeepSeek-R1 and DeepSeek-V3 in Chinese contexts, revealing their performance across safety categories. The experimental results quantify the deficiencies of these two models in Chinese contexts, providing key insights for subsequent improvements. 
</p> </div> </dd> <dt> <a name='item69'>[69]</a> <a href ="/abs/2502.11150" title="Abstract" id="2502.11150"> arXiv:2502.11150 </a> [<a href="/pdf/2502.11150" title="Download PDF" id="pdf-2502.11150" aria-labelledby="pdf-2502.11150">pdf</a>, <a href="https://arxiv.org/html/2502.11150v1" title="View HTML" id="html-2502.11150" aria-labelledby="html-2502.11150" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11150" title="Other formats" id="oth-2502.11150" aria-labelledby="oth-2502.11150">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Surprisal Takes It All: Eye Tracking Based Cognitive Evaluation of Text Readability Measures </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Klein,+K+G">Keren Gruteke Klein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Frenkel,+S">Shachar Frenkel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shubi,+O">Omer Shubi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berzak,+Y">Yevgeni Berzak</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Text readability measures are widely used in many real-world scenarios and in NLP. These measures have primarily been developed by predicting reading comprehension outcomes, while largely neglecting what is perhaps the core aspect of a readable text: reading ease. In this work, we propose a new eye tracking based methodology for evaluating readability measures, which focuses on their ability to account for reading facilitation effects in text simplification, as well as for text reading ease more broadly. Using this approach, we find that existing readability formulas are moderate to poor predictors of reading ease. 
We further find that average per-word length, frequency, and especially surprisal tend to outperform existing readability formulas as measures of reading ease. We thus propose surprisal as a simple unsupervised alternative to existing measures. </p> </div> </dd> <dt> <a name='item70'>[70]</a> <a href ="/abs/2502.11169" title="Abstract" id="2502.11169"> arXiv:2502.11169 </a> [<a href="/pdf/2502.11169" title="Download PDF" id="pdf-2502.11169" aria-labelledby="pdf-2502.11169">pdf</a>, <a href="https://arxiv.org/html/2502.11169v1" title="View HTML" id="html-2502.11169" aria-labelledby="html-2502.11169" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11169" title="Other formats" id="oth-2502.11169" aria-labelledby="oth-2502.11169">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Constrained Monte Carlo Tree Search to Generate Reliable Long Chain-of-Thought for Mathematical Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Q">Qingwen Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Boyan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zijian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Z">Zhifeng Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Keli Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+R">Ruichu Cai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recently, Long Chain-of-Thoughts (CoTs) have gained widespread attention for improving the reasoning capabilities of Large Language Models (LLMs). This necessitates that existing LLMs, which lack the ability to generate Long CoTs, acquire such capability through post-training methods. 
Without additional training, LLMs typically enhance their mathematical reasoning abilities through inference scaling methods such as MCTS. However, they are hindered by the large action space and inefficient search strategies, making it challenging to generate Long CoTs effectively. To tackle this issue, we propose constraining the action space and guiding the emergence of Long CoTs through a refined search strategy. In our proposed Constrained Monte Carlo Tree Search (C-MCTS) framework, we limit the actions selected from a constrained action space, which is divided into five disjoint subsets: \emph{understanding}, \emph{planning}, \emph{reflection}, \emph{coding}, and \emph{summary}. Each subset is further constrained to a small number of predefined prompts, rather than allowing LLMs to generate actions arbitrarily. Additionally, we refine the search strategy by incorporating prior knowledge about the action sets, such as a human-like partial order of the action subsets and the pretrained process reward models. These strategies work together to significantly reduce the vast search space of Long CoTs. Extensive evaluations on mathematical reasoning benchmarks show that, under zero-shot settings, our method enables the 7B model to achieve reasoning capabilities that surpass those of the 72B model. 
</p> </div> </dd> <dt> <a name='item71'>[71]</a> <a href ="/abs/2502.11175" title="Abstract" id="2502.11175"> arXiv:2502.11175 </a> [<a href="/pdf/2502.11175" title="Download PDF" id="pdf-2502.11175" aria-labelledby="pdf-2502.11175">pdf</a>, <a href="https://arxiv.org/html/2502.11175v1" title="View HTML" id="html-2502.11175" aria-labelledby="html-2502.11175" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11175" title="Other formats" id="oth-2502.11175" aria-labelledby="oth-2502.11175">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Investigating Language Preference of Multilingual RAG Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+J">Jeonghyun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 30 pages, 16 tables, 14 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual Retrieval-Augmented Generation (mRAG) systems enhance language models by integrating external multilingual information to produce context-aware responses. However, mRAG systems struggle with retrieving relevant information due to linguistic variations between queries and documents, generating inconsistent responses when multilingual sources conflict. In this work, we systematically investigate language preferences in both retrieval and generation of mRAG through a series of experiments. Our analysis indicates that retrievers tend to prefer high-resource and query languages, yet this preference does not consistently improve generation performance. Moreover, we observe that generators prefer the query language or Latin scripts, leading to inconsistent outputs. 
To overcome these issues, we propose Dual Knowledge Multilingual RAG (DKM-RAG), a simple yet effective framework that fuses translated multilingual passages with complementary model knowledge. Empirical results demonstrate that DKM-RAG mitigates language preference in generation and enhances performance across diverse linguistic settings. </p> </div> </dd> <dt> <a name='item72'>[72]</a> <a href ="/abs/2502.11176" title="Abstract" id="2502.11176"> arXiv:2502.11176 </a> [<a href="/pdf/2502.11176" title="Download PDF" id="pdf-2502.11176" aria-labelledby="pdf-2502.11176">pdf</a>, <a href="/format/2502.11176" title="Other formats" id="oth-2502.11176" aria-labelledby="oth-2502.11176">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LogiDynamics: Unraveling the Dynamics of Logical Inference in Large Language Model Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+T">Tianshi Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+J">Jiayang Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chunyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+H">Haochen Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zihao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+J">Jiaxin Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yangqiu Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+G+Y">Ginny Y. 
Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=See,+S">Simon See</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Modern large language models (LLMs) employ various forms of logical inference, both implicitly and explicitly, when addressing reasoning tasks. Understanding how to optimally leverage these inference paradigms is critical for advancing LLMs' reasoning capabilities. This paper adopts an exploratory approach by introducing a controlled evaluation environment for analogical reasoning -- a fundamental cognitive task -- that is systematically parameterized across three dimensions: modality (textual, visual, symbolic), difficulty (easy, medium, hard), and task format (multiple-choice or free-text generation). We analyze the comparative dynamics of inductive, abductive, and deductive inference pipelines across these dimensions, and demonstrate that our findings generalize to broader in-context learning tasks. Additionally, we investigate advanced paradigms such as hypothesis selection, verification, and refinement, revealing their potential to scale up logical inference in LLM reasoning. This exploratory study provides a foundation for future research in enhancing LLM reasoning through systematic logical inference strategies. 
</p> </div> </dd> <dt> <a name='item73'>[73]</a> <a href ="/abs/2502.11177" title="Abstract" id="2502.11177"> arXiv:2502.11177 </a> [<a href="/pdf/2502.11177" title="Download PDF" id="pdf-2502.11177" aria-labelledby="pdf-2502.11177">pdf</a>, <a href="/format/2502.11177" title="Other formats" id="oth-2502.11177" aria-labelledby="oth-2502.11177">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Mirage of Model Editing: Revisiting Evaluation in the Wild </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+W">Wanli Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+F">Fei Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+J">Jiajun Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+X">Xinyu Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Q">Qi Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Dawei Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite near-perfect results in artificial evaluations, the effectiveness of model editing in real-world applications remains unexplored. To bridge this gap, we propose to study model editing in question answering (QA) by establishing a rigorous evaluation practice to assess the effectiveness of editing methods in correcting LLMs' errors. It consists of QAEdit, a new benchmark derived from popular QA datasets, and a standardized evaluation framework. Our single editing experiments indicate that current editing methods perform substantially worse than previously reported (38.5% vs. ~96%). 
Through module analysis and controlled experiments, we demonstrate that this performance decline stems from issues in evaluation practices of prior editing research. One key issue is that the inappropriate use of teacher forcing in testing prevents error propagation by feeding ground truth tokens (inaccessible in real-world scenarios) as input. Furthermore, we simulate real-world deployment by sequential editing, revealing that current approaches fail drastically with only 1000 edits. Our analysis provides a fundamental reexamination of both the real-world applicability of existing model editing methods and their evaluation practices, and establishes a rigorous evaluation framework with key insights to advance reliable and practical model editing research. </p> </div> </dd> <dt> <a name='item74'>[74]</a> <a href ="/abs/2502.11183" title="Abstract" id="2502.11183"> arXiv:2502.11183 </a> [<a href="/pdf/2502.11183" title="Download PDF" id="pdf-2502.11183" aria-labelledby="pdf-2502.11183">pdf</a>, <a href="https://arxiv.org/html/2502.11183v1" title="View HTML" id="html-2502.11183" aria-labelledby="html-2502.11183" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11183" title="Other formats" id="oth-2502.11183" aria-labelledby="oth-2502.11183">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Don't Get Lost in the Trees: Streamlining LLM Reasoning by Overcoming Tree Search Exploration Pitfalls </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+A">Ante Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+L">Linfeng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Ye Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dian Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+H">Haitao Mi</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+X">Xiangyu Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Z">Zhaopeng Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jinsong Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dong Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in tree search algorithms guided by verifiers have significantly enhanced the reasoning capabilities of large language models (LLMs), but at the cost of increased computational resources. In this work, we identify two key challenges contributing to this inefficiency: $\textit{over-exploration}$ due to redundant states with semantically equivalent content, and $\textit{under-exploration}$ caused by high variance in verifier scoring leading to frequent trajectory switching. To address these issues, we propose FETCH, an e$\textbf{f}$fici$\textbf{e}$nt $\textbf{t}$ree sear$\textbf{ch}$ framework, which is a flexible, plug-and-play system compatible with various tree search algorithms. Our framework mitigates over-exploration by merging semantically similar states using agglomerative clustering of text embeddings obtained from a fine-tuned SimCSE model. To tackle under-exploration, we enhance verifiers by incorporating temporal difference learning with adjusted $\lambda$-returns during training to reduce variance, and employing a verifier ensemble to aggregate scores during inference. Experiments on GSM8K, GSM-Plus, and MATH datasets demonstrate that our methods significantly improve reasoning accuracy and computational efficiency across four different tree search algorithms, paving the way for more practical applications of LLM-based reasoning. The code will be released upon acceptance. 
</p> </div> </dd> <dt> <a name='item75'>[75]</a> <a href ="/abs/2502.11184" title="Abstract" id="2502.11184"> arXiv:2502.11184 </a> [<a href="/pdf/2502.11184" title="Download PDF" id="pdf-2502.11184" aria-labelledby="pdf-2502.11184">pdf</a>, <a href="https://arxiv.org/html/2502.11184v1" title="View HTML" id="html-2502.11184" aria-labelledby="html-2502.11184" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11184" title="Other formats" id="oth-2502.11184" aria-labelledby="oth-2502.11184">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can't See the Forest for the Trees: Benchmarking Multimodal Safety Awareness for Multimodal LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+K">Kuiyi Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jen-tse Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Youliang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+P">Pinjia He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Z">Zhaopeng Tu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Multimedia (cs.MM) </div> <p class='mathjax'> Multimodal Large Language Models (MLLMs) have expanded the capabilities of traditional language models by enabling interaction through both text and images. 
However, ensuring the safety of these models remains a significant challenge, particularly in accurately identifying whether multimodal content is safe or unsafe — a capability we term safety awareness. In this paper, we introduce MMSafeAware, the first comprehensive multimodal safety awareness benchmark designed to evaluate MLLMs across 29 safety scenarios with 1500 carefully curated image-prompt pairs. MMSafeAware includes both unsafe and over-safety subsets to assess models' abilities to correctly identify unsafe content and avoid over-sensitivity that can hinder helpfulness. Evaluating nine widely used MLLMs using MMSafeAware reveals that current models are not sufficiently safe and often overly sensitive; for example, GPT-4V misclassifies 36.1% of unsafe inputs as safe and 59.9% of benign inputs as unsafe. We further explore three methods to improve safety awareness — prompting-based approaches, visual contrastive decoding, and vision-centric reasoning fine-tuning — but find that none achieve satisfactory performance. Our findings highlight the profound challenges in developing MLLMs with robust safety awareness, underscoring the need for further research in this area. All the code and data will be publicly available to facilitate future research. 
</p> </div> </dd> <dt> <a name='item76'>[76]</a> <a href ="/abs/2502.11187" title="Abstract" id="2502.11187"> arXiv:2502.11187 </a> [<a href="/pdf/2502.11187" title="Download PDF" id="pdf-2502.11187" aria-labelledby="pdf-2502.11187">pdf</a>, <a href="https://arxiv.org/html/2502.11187v1" title="View HTML" id="html-2502.11187" aria-labelledby="html-2502.11187" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11187" title="Other formats" id="oth-2502.11187" aria-labelledby="oth-2502.11187">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nahin,+S+K">Shahriar Kabir Nahin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nandi,+R+N">Rabindra Nath Nandi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sarker,+S">Sagor Sarker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muhtaseem,+Q+S">Quazi Sarwar Muhtaseem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kowsher,+M">Md Kowsher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shill,+A+C">Apu Chandraw Shill</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ibrahim,+M">Md Ibrahim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Menon,+M+H">Mehadi Hasan Menon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muntasir,+T+A">Tareq Al Muntasir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alam,+F">Firoj Alam</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> LLMs, Benchmarking, Large Language Models, Bangla </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p 
class='mathjax'> In this paper, we present TituLLMs, the first large pretrained Bangla LLMs, available in 1B and 3B parameter sizes. Due to computational constraints during both training and inference, we focused on smaller models. To train TituLLMs, we collected a pretraining dataset of approximately 37 billion tokens. We extended the Llama-3.2 tokenizer to incorporate language- and culture-specific knowledge, which also enables faster training and inference. There was a lack of benchmarking datasets to evaluate LLMs for Bangla. To address this gap, we developed five benchmarking datasets. We benchmarked various LLMs, including TituLLMs, and demonstrated that TituLLMs outperforms its initial multilingual versions. However, this is not always the case, highlighting the complexities of language adaptation. Our work lays the groundwork for adapting existing multilingual open models to other low-resource languages. To facilitate broader adoption and further research, we have made the TituLLMs models and benchmarking datasets publicly available (<a href="https://huggingface.co/collections/hishab/titulm-llama-family-6718d31fc1b83529276f490a" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). 
</p> </div> </dd> <dt> <a name='item77'>[77]</a> <a href ="/abs/2502.11190" title="Abstract" id="2502.11190"> arXiv:2502.11190 </a> [<a href="/pdf/2502.11190" title="Download PDF" id="pdf-2502.11190" aria-labelledby="pdf-2502.11190">pdf</a>, <a href="/format/2502.11190" title="Other formats" id="oth-2502.11190" aria-labelledby="oth-2502.11190">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ReLearn: Unlearning via Learning for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haoming Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+N">Ningyuan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Liming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Sendong Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+S">Shumin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+M">Mengru Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hooi,+B">Bryan Hooi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oo,+N">Nay Oo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Huajun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+N">Ningyu Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG) </div> <p class='mathjax'> Current unlearning methods for large language models usually rely on reverse optimization to reduce target token probabilities. 
However, this paradigm disrupts the subsequent tokens prediction, degrading model performance and linguistic coherence. Moreover, existing evaluation metrics overemphasize contextual forgetting while inadequately assessing response fluency and relevance. To address these challenges, we propose ReLearn, a data augmentation and fine-tuning pipeline for effective unlearning, along with a comprehensive evaluation framework. This framework introduces Knowledge Forgetting Rate (KFR) and Knowledge Retention Rate (KRR) to measure knowledge-level preservation, and Linguistic Score (LS) to evaluate generation quality. Our experiments show that ReLearn successfully achieves targeted forgetting while preserving high-quality output. Through mechanistic analysis, we further demonstrate how reverse optimization disrupts coherent text generation, while ReLearn preserves this essential capability. Code is available at <a href="https://github.com/zjunlp/unlearn" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item78'>[78]</a> <a href ="/abs/2502.11193" title="Abstract" id="2502.11193"> arXiv:2502.11193 </a> [<a href="/pdf/2502.11193" title="Download PDF" id="pdf-2502.11193" aria-labelledby="pdf-2502.11193">pdf</a>, <a href="https://arxiv.org/html/2502.11193v1" title="View HTML" id="html-2502.11193" aria-labelledby="html-2502.11193" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11193" title="Other formats" id="oth-2502.11193" aria-labelledby="oth-2502.11193">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models Penetration in Scholarly Writing and Peer Review </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+L">Li Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruijie Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+X">Xunlian Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hershcovich,+D">Daniel Hershcovich</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haizhou Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Transparency in NLP, LLM-generated text evaluation and detection, LLM Penetration, Scholarly Credibility and Accountability </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While the widespread use of Large Language Models (LLMs) brings convenience, it also raises concerns about the credibility of academic research and scholarly processes. To better understand these dynamics, we evaluate the penetration of LLMs across academic workflows from multiple perspectives and dimensions, providing compelling evidence of their growing influence. 
We propose a framework with two components: \texttt{ScholarLens}, a curated dataset of human- and LLM-generated content across scholarly writing and peer review for multi-perspective evaluation, and \texttt{LLMetrica}, a tool for assessing LLM penetration using rule-based metrics and model-based detectors for multi-dimensional evaluation. Our experiments demonstrate the effectiveness of \texttt{LLMetrica}, revealing the increasing role of LLMs in scholarly processes. These findings emphasize the need for transparency, accountability, and ethical practices in LLM usage to maintain academic credibility. </p> </div> </dd> <dt> <a name='item79'>[79]</a> <a href ="/abs/2502.11198" title="Abstract" id="2502.11198"> arXiv:2502.11198 </a> [<a href="/pdf/2502.11198" title="Download PDF" id="pdf-2502.11198" aria-labelledby="pdf-2502.11198">pdf</a>, <a href="https://arxiv.org/html/2502.11198v1" title="View HTML" id="html-2502.11198" aria-labelledby="html-2502.11198" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11198" title="Other formats" id="oth-2502.11198" aria-labelledby="oth-2502.11198">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ANCHOLIK-NER: A Benchmark Dataset for Bangla Regional Named Entity Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Paul,+B">Bidyarthi Paul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Preotee,+F+F">Faika Fairuj Preotee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sarker,+S">Shuvashis Sarker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Refat,+S+R">Shamim Rahim Refat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Islam,+S">Shifat Islam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muhammad,+T">Tashreef Muhammad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hoque,+M+A">Mohammad 
Ashraful Hoque</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Manzoor,+S">Shahriar Manzoor</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> ANCHOLIK-NER is a linguistically diverse dataset for Named Entity Recognition (NER) in Bangla regional dialects, capturing variations across Sylhet, Chittagong, and Barishal. The dataset has around 10,443 sentences, with approximately 3,481 sentences per region. The data was collected from two publicly available datasets and through web scraping from various online newspapers and articles. To ensure high-quality annotations, the BIO tagging scheme was employed, and professional annotators with expertise in regional dialects carried out the labeling process. The dataset is structured into separate subsets for each region and is available in CSV format. Each entry contains textual data along with identified named entities and their corresponding annotations. Named entities are categorized into ten distinct classes: Person, Location, Organization, Food, Animal, Colour, Role, Relation, Object, and Miscellaneous. This dataset serves as a valuable resource for developing and evaluating NER models for Bangla dialectal variations, contributing to regional language processing and low-resource NLP applications. It can be utilized to enhance NER systems in Bangla dialects, improve regional language understanding, and support applications in machine translation, information retrieval, and conversational AI. 
</p> </div> </dd> <dt> <a name='item80'>[80]</a> <a href ="/abs/2502.11211" title="Abstract" id="2502.11211"> arXiv:2502.11211 </a> [<a href="/pdf/2502.11211" title="Download PDF" id="pdf-2502.11211" aria-labelledby="pdf-2502.11211">pdf</a>, <a href="https://arxiv.org/html/2502.11211v1" title="View HTML" id="html-2502.11211" aria-labelledby="html-2502.11211" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11211" title="Other formats" id="oth-2502.11211" aria-labelledby="oth-2502.11211">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Survey of LLM-based Agents in Medicine: How far are we from Baymax? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zizhan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenghan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Wenting Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yixuan Yuan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large Language Models (LLMs) are transforming healthcare through the development of LLM-based agents that can understand, reason about, and assist with medical tasks. This survey provides a comprehensive review of LLM-based agents in medicine, examining their architectures, applications, and challenges. 
We analyze the key components of medical agent systems, including system profiles, clinical planning mechanisms, medical reasoning frameworks, and external capacity enhancement. The survey covers major application scenarios such as clinical decision support, medical documentation, training simulations, and healthcare service optimization. We discuss evaluation frameworks and metrics used to assess these agents' performance in healthcare settings. While LLM-based agents show promise in enhancing healthcare delivery, several challenges remain, including hallucination management, multimodal integration, implementation barriers, and ethical considerations. The survey concludes by highlighting future research directions, including advances in medical reasoning inspired by recent developments in LLM architectures, integration with physical systems, and improvements in training simulations. This work provides researchers and practitioners with a structured overview of the current state and future prospects of LLM-based agents in medicine. 
</p> </div> </dd> <dt> <a name='item81'>[81]</a> <a href ="/abs/2502.11223" title="Abstract" id="2502.11223"> arXiv:2502.11223 </a> [<a href="/pdf/2502.11223" title="Download PDF" id="pdf-2502.11223" aria-labelledby="pdf-2502.11223">pdf</a>, <a href="/format/2502.11223" title="Other formats" id="oth-2502.11223" aria-labelledby="oth-2502.11223">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Asymmetric Conflict and Synergy in Post-training for LLM-based Multilingual Machine Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+T">Tong Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Y">Yan Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+H">Huiwen Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junfeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heng Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The emergence of Large Language Models (LLMs) has advanced the multilingual machine translation (MMT), yet the Curse of Multilinguality (CoM) remains a major challenge. Existing work in LLM-based MMT typically mitigates this issue via scaling up training and computation budget, which raises a critical question: Is scaling up the training and computation budget truly necessary for high-quality MMT, or can a deeper understanding of CoM provide a more efficient solution? To explore this problem, we analyze the linguistic conflicts and synergy, the underlying mechanism of CoM during post-training phase. 
We identify an asymmetric phenomenon in linguistic conflicts and synergy: the dominance of conflicts and synergy varies in different translation directions, leading to sub-optimal adaptation in existing post-training methods. We further find that a significant bottleneck in MMT appears to lie in post-training rather than multilingual pre-training, suggesting the need for more effective adaptation strategies. Building on these new insights, we propose a direction-aware training approach, combined with group-wise model merging, to address asymmetry in linguistic conflicts and synergy explicitly. Leveraging this strategy, our method fine-tunes X-ALMA-13B-Pretrain-trained only with multilingual pre-training-achieving comparable performance to XALMA-13B (only SFT) while using only 20B pretraining tokens and 17B parameters-5.5x fewer pretraining-tokens and 1.7x fewer model size-with just 0.85 COMET drop on Flores-200 testsets of 50 languages. </p> </div> </dd> <dt> <a name='item82'>[82]</a> <a href ="/abs/2502.11228" title="Abstract" id="2502.11228"> arXiv:2502.11228 </a> [<a href="/pdf/2502.11228" title="Download PDF" id="pdf-2502.11228" aria-labelledby="pdf-2502.11228">pdf</a>, <a href="https://arxiv.org/html/2502.11228v1" title="View HTML" id="html-2502.11228" aria-labelledby="html-2502.11228" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11228" title="Other formats" id="oth-2502.11228" aria-labelledby="oth-2502.11228">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Vendi-RAG: Adaptively Trading-Off Diversity And Quality Significantly Improves Retrieval Augmented Generation With LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rezaei,+M+R">Mohammad Reza Rezaei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dieng,+A+B">Adji Bousso Dieng</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> A RAG pipeline that accounts for both diversity and answer quality and that can be used with any LLM backbone to solve complex multi-hop question-answering tasks </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Retrieval-augmented generation (RAG) enhances large language models (LLMs) for domain-specific question-answering (QA) tasks by leveraging external knowledge sources. However, traditional RAG systems primarily focus on relevance-based retrieval and often struggle with redundancy, especially when reasoning requires connecting information from multiple sources. This paper introduces Vendi-RAG, a framework based on an iterative process that jointly optimizes retrieval diversity and answer quality. This joint optimization leads to significantly higher accuracy for multi-hop QA tasks. Vendi-RAG leverages the Vendi Score (VS), a flexible similarity-based diversity metric, to promote semantic diversity in document retrieval. It then uses an LLM judge that evaluates candidate answers, generated after a reasoning step, and outputs a score that the retriever uses to balance relevance and diversity among the retrieved documents during each iteration. Experiments on three challenging datasets -- HotpotQA, MuSiQue, and 2WikiMultiHopQA -- demonstrate Vendi-RAG's effectiveness in multi-hop reasoning tasks. The framework achieves significant accuracy improvements over traditional single-step and multi-step RAG approaches, with accuracy increases reaching up to +4.2% on HotpotQA, +4.1% on 2WikiMultiHopQA, and +1.3% on MuSiQue compared to Adaptive-RAG, the current best baseline. The benefits of Vendi-RAG are even more pronounced as the number of retrieved documents increases. 
Finally, we evaluated Vendi-RAG across different LLM backbones, including GPT-3.5, GPT-4, and GPT-4o-mini, and observed consistent improvements, demonstrating that the framework's advantages are model-agnostic. </p> </div> </dd> <dt> <a name='item83'>[83]</a> <a href ="/abs/2502.11244" title="Abstract" id="2502.11244"> arXiv:2502.11244 </a> [<a href="/pdf/2502.11244" title="Download PDF" id="pdf-2502.11244" aria-labelledby="pdf-2502.11244">pdf</a>, <a href="https://arxiv.org/html/2502.11244v1" title="View HTML" id="html-2502.11244" aria-labelledby="html-2502.11244" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11244" title="Other formats" id="oth-2502.11244" aria-labelledby="oth-2502.11244">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Soteria: Language-Specific Functional Parameter Steering for Multilingual Safety Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Banerjee,+S">Somnath Banerjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Layek,+S">Sayan Layek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chatterjee,+P">Pratyush Chatterjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hazra,+R">Rima Hazra</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Ensuring consistent safety across multiple languages remains a significant challenge for large language models (LLMs). We introduce Soteria, a lightweight yet powerful strategy that locates and minimally adjusts the "functional heads" most responsible for harmful content generation in each language. 
By altering only a fraction of parameters, Soteria drastically reduces policy violations without sacrificing overall model performance, even in low-resource settings. To rigorously evaluate our approach, we also present XThreatBench, a specialized multilingual dataset capturing fine-grained harmful behaviors drawn from real policy guidelines. Experiments with leading open-source LLMs (e.g., Llama, Qwen, Mistral) show that Soteria consistently improves safety metrics across high-, mid-, and low-resource languages. These findings highlight a promising path toward scalable, linguistically attuned, and ethically aligned LLMs worldwide. </p> </div> </dd> <dt> <a name='item84'>[84]</a> <a href ="/abs/2502.11250" title="Abstract" id="2502.11250"> arXiv:2502.11250 </a> [<a href="/pdf/2502.11250" title="Download PDF" id="pdf-2502.11250" aria-labelledby="pdf-2502.11250">pdf</a>, <a href="https://arxiv.org/html/2502.11250v1" title="View HTML" id="html-2502.11250" aria-labelledby="html-2502.11250" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11250" title="Other formats" id="oth-2502.11250" aria-labelledby="oth-2502.11250">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncertainty-Aware Step-wise Verification with Generative Reward Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+Z">Zihuiwen Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Melo,+L+C">Luckeciano Carvalho Melo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaddar,+Y">Younesse Kaddar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Blunsom,+P">Phil Blunsom</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Staton,+S">Sam Staton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gal,+Y">Yarin Gal</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Complex multi-step reasoning tasks, such as solving mathematical problems, remain challenging for large language models (LLMs). While outcome supervision is commonly used, process supervision via process reward models (PRMs) provides intermediate rewards to verify step-wise correctness in solution traces. However, as proxies for human judgement, PRMs suffer from reliability issues, including susceptibility to reward hacking. In this work, we propose leveraging uncertainty quantification (UQ) to enhance the reliability of step-wise verification with generative reward models for mathematical reasoning tasks. We introduce CoT Entropy, a novel UQ method that outperforms existing approaches in quantifying a PRM's uncertainty in step-wise verification. Our results demonstrate that incorporating uncertainty estimates improves the robustness of judge-LM PRMs, leading to more reliable verification. </p> </div> </dd> <dt> <a name='item85'>[85]</a> <a href ="/abs/2502.11258" title="Abstract" id="2502.11258"> arXiv:2502.11258 </a> [<a href="/pdf/2502.11258" title="Download PDF" id="pdf-2502.11258" aria-labelledby="pdf-2502.11258">pdf</a>, <a href="https://arxiv.org/html/2502.11258v1" title="View HTML" id="html-2502.11258" aria-labelledby="html-2502.11258" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11258" title="Other formats" id="oth-2502.11258" aria-labelledby="oth-2502.11258">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Conditional Mutual Information to Improve Large Language Model Fine-Tuning For Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sivakaran,+T">Thanushon Sivakaran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+E">En-Hui Yang</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> 6 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Although large language models (LLMs) have demonstrated remarkable capabilities in recent years, the potential of information theory (IT) to enhance LLM development remains underexplored. This paper introduces the information theoretic principle of Conditional Mutual Information (CMI) to LLM fine-tuning for classification tasks, exploring its promise in two main ways: minimizing CMI to improve a model's standalone performance and maximizing CMI to enhance knowledge distillation (KD) for more capable student models. To apply CMI in LLM fine-tuning, we adapt the recently proposed CMI-constrained deep learning framework, which was initially developed for image classification, with some modification. By minimizing CMI during LLM fine-tuning, we achieve superior performance gains on 6 of 8 GLUE classification tasks compared to BERT. Additionally, maximizing CMI during the KD process results in significant performance improvements in 6 of 8 GLUE classification tasks compared to DistilBERT. These findings demonstrate CMI's adaptability for optimizing both standalone LLMs and student models, showcasing its potential as a robust framework for advancing LLM fine-tuning. Our work bridges the gap between information theory and LLM development, offering new insights for building high-performing language models. 
</p> </div> </dd> <dt> <a name='item86'>[86]</a> <a href ="/abs/2502.11266" title="Abstract" id="2502.11266"> arXiv:2502.11266 </a> [<a href="/pdf/2502.11266" title="Download PDF" id="pdf-2502.11266" aria-labelledby="pdf-2502.11266">pdf</a>, <a href="https://arxiv.org/html/2502.11266v1" title="View HTML" id="html-2502.11266" aria-labelledby="html-2502.11266" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11266" title="Other formats" id="oth-2502.11266" aria-labelledby="oth-2502.11266">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Shrinking Landscape of Linguistic Diversity in the Age of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sourati,+Z">Zhivar Sourati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karimi-Malekabadi,+F">Farzan Karimi-Malekabadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ozcan,+M">Meltem Ozcan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McDaniel,+C">Colin McDaniel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ziabari,+A">Alireza Ziabari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Trager,+J">Jackson Trager</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tak,+A">Ala Tak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Meng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Morstatter,+F">Fred Morstatter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dehghani,+M">Morteza Dehghani</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2404.00267" data-arxiv-id="2404.00267" class="link-https">arXiv:2404.00267</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Language is far more than a communication tool. A wealth of information - including but not limited to the identities, psychological states, and social contexts of its users - can be gleaned through linguistic markers, and such insights are routinely leveraged across diverse fields ranging from product development and marketing to healthcare. In four studies utilizing experimental and observational methods, we demonstrate that the widespread adoption of large language models (LLMs) as writing assistants is linked to notable declines in linguistic diversity and may interfere with the societal and psychological insights language provides. We show that while the core content of texts is retained when LLMs polish and rewrite texts, not only do they homogenize writing styles, but they also alter stylistic elements in a way that selectively amplifies certain dominant characteristics or biases while suppressing others - emphasizing conformity over individuality. By varying LLMs, prompts, classifiers, and contexts, we show that these trends are robust and consistent. Our findings highlight a wide array of risks associated with linguistic homogenization, including compromised diagnostic processes and personalization efforts, the exacerbation of existing divides and barriers to equity in settings like personnel selection where language plays a critical role in assessing candidates' qualifications, communication skills, and cultural fit, and the undermining of efforts for cultural preservation. 
</p> </div> </dd> <dt> <a name='item87'>[87]</a> <a href ="/abs/2502.11268" title="Abstract" id="2502.11268"> arXiv:2502.11268 </a> [<a href="/pdf/2502.11268" title="Download PDF" id="pdf-2502.11268" aria-labelledby="pdf-2502.11268">pdf</a>, <a href="https://arxiv.org/html/2502.11268v1" title="View HTML" id="html-2502.11268" aria-labelledby="html-2502.11268" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11268" title="Other formats" id="oth-2502.11268" aria-labelledby="oth-2502.11268">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improved Unbiased Watermark for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Ruibo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yihan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junfeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heng Huang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As artificial intelligence surpasses human capabilities in text generation, the necessity to authenticate the origins of AI-generated content has become paramount. Unbiased watermarks offer a powerful solution by embedding statistical signals into language model-generated text without distorting the quality. In this paper, we introduce MCmark, a family of unbiased, Multi-Channel-based watermarks. MCmark works by partitioning the model's vocabulary into segments and promoting token probabilities within a selected segment based on a watermark key. We demonstrate that MCmark not only preserves the original distribution of the language model but also offers significant improvements in detectability and robustness over existing unbiased watermarks. 
Our experiments with widely-used language models demonstrate an improvement in detectability of over 10% using MCmark, compared to existing state-of-the-art unbiased watermarks. This advancement underscores MCmark's potential in enhancing the practical application of watermarking in AI-generated texts. </p> </div> </dd> <dt> <a name='item88'>[88]</a> <a href ="/abs/2502.11275" title="Abstract" id="2502.11275"> arXiv:2502.11275 </a> [<a href="/pdf/2502.11275" title="Download PDF" id="pdf-2502.11275" aria-labelledby="pdf-2502.11275">pdf</a>, <a href="https://arxiv.org/html/2502.11275v1" title="View HTML" id="html-2502.11275" aria-labelledby="html-2502.11275" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11275" title="Other formats" id="oth-2502.11275" aria-labelledby="oth-2502.11275">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cuckoo: An IE Free Rider Hatched by Massive Nutrition in LLM's Nest </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+L">Letian Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zilong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+F">Feng Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+J">Jingbo Shang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Massive high-quality data, both pre-training raw texts and post-training annotations, have been carefully prepared to incubate advanced large language models (LLMs). In contrast, for information extraction (IE), pre-training data, such as BIO-tagged sequences, are hard to scale up. 
We show that IE models can act as free riders on LLM resources by reframing next-token \emph{prediction} into \emph{extraction} for tokens already present in the context. Specifically, our proposed next tokens extraction (NTE) paradigm learns a versatile IE model, \emph{Cuckoo}, with 102.6M extractive data converted from LLM's pre-training and post-training data. Under the few-shot setting, Cuckoo adapts effectively to traditional and complex instruction-following IE with better performance than existing pre-trained IE models. As a free rider, Cuckoo can naturally evolve with the ongoing advancements in LLM data preparation, benefiting from improvements in LLM training pipelines without additional manual effort. </p> </div> </dd> <dt> <a name='item89'>[89]</a> <a href ="/abs/2502.11276" title="Abstract" id="2502.11276"> arXiv:2502.11276 </a> [<a href="/pdf/2502.11276" title="Download PDF" id="pdf-2502.11276" aria-labelledby="pdf-2502.11276">pdf</a>, <a href="https://arxiv.org/html/2502.11276v1" title="View HTML" id="html-2502.11276" aria-labelledby="html-2502.11276" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11276" title="Other formats" id="oth-2502.11276" aria-labelledby="oth-2502.11276">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Rotary Position Embedding May Cause Dimension Inefficiency in Attention Heads for Long-Distance Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chiang,+T">Ting-Rui Chiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yogatama,+D">Dani Yogatama</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> The Rotary Position Embedding (RoPE) is widely used in the attention heads of many large language models (LLM). 
It rotates dimensions in the query and the key vectors by different angles according to their positions in the input sequence. For long context modeling, the range of positions may vary a lot, and thus RoPE rotates some dimensions by a great range of angles. We hypothesize that the wide range of rotation angles may prevent LLMs from utilizing those dimensions. To validate this hypothesis, we present a controlled experiment showing that applying RoPE causes low utility of certain dimensions. Our analyses on three LLMs also indicate that these dimensions do not help LLMs do long-context question answering. </p> </div> </dd> <dt> <a name='item90'>[90]</a> <a href ="/abs/2502.11300" title="Abstract" id="2502.11300"> arXiv:2502.11300 </a> [<a href="/pdf/2502.11300" title="Download PDF" id="pdf-2502.11300" aria-labelledby="pdf-2502.11300">pdf</a>, <a href="https://arxiv.org/html/2502.11300v1" title="View HTML" id="html-2502.11300" aria-labelledby="html-2502.11300" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11300" title="Other formats" id="oth-2502.11300" aria-labelledby="oth-2502.11300">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CORDIAL: Can Multimodal Large Language Models Effectively Understand Coherence Relationships? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ramakrishnan,+A+A">Aashish Anantha Ramakrishnan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramakrishnan,+A+A">Aadarsh Anantha Ramakrishnan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D">Dongwon Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Multimodal Large Language Models (MLLMs) are renowned for their superior instruction-following and reasoning capabilities across diverse problem domains. However, existing benchmarks primarily focus on assessing factual and logical correctness in downstream tasks, with limited emphasis on evaluating MLLMs' ability to interpret pragmatic cues and intermodal relationships. To address this gap, we assess the competency of MLLMs in performing Multimodal Discourse Analysis (MDA) using Coherence Relations. Our benchmark, CORDIAL, encompasses a broad spectrum of Coherence Relations across 3 different discourse domains at varying levels of granularity. Through our experiments on 10+ MLLMs employing different prompting strategies, we show that even top models like Gemini 1.5 Pro and GPT-4o fail to match the performance of simple classifier-based baselines. This study emphasizes the need to move beyond similarity-based metrics and adopt a discourse-driven framework for evaluating MLLMs, providing a more nuanced assessment of their capabilities. The benchmark and code are available at: <a href="https://github.com/aashish2000/CORDIAL" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item91'>[91]</a> <a href ="/abs/2502.11306" title="Abstract" id="2502.11306"> arXiv:2502.11306 </a> [<a href="/pdf/2502.11306" title="Download PDF" id="pdf-2502.11306" aria-labelledby="pdf-2502.11306">pdf</a>, <a href="https://arxiv.org/html/2502.11306v1" title="View HTML" id="html-2502.11306" aria-labelledby="html-2502.11306" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11306" title="Other formats" id="oth-2502.11306" aria-labelledby="oth-2502.11306">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Smoothing Out Hallucinations: Mitigating LLM Hallucination with Smoothed Knowledge Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+H">Hieu Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zihao He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gandre,+S+A">Shoumik Atul Gandre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pasupulety,+U">Ujjwal Pasupulety</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shivakumar,+S+K">Sharanya Kumari Shivakumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lerman,+K">Kristina Lerman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) often suffer from hallucination, generating factually incorrect or ungrounded content, which limits their reliability in high-stakes applications. A key factor contributing to hallucination is the use of hard labels during training, which enforce deterministic supervision, encourage overconfidence, and disregard the uncertainty inherent in natural language. 
To address this, we propose mitigating hallucination through knowledge distillation (KD), where a teacher model provides smoothed soft labels to a student model, reducing overconfidence and improving factual grounding. We apply KD during supervised finetuning on instructional data, evaluating its effectiveness across LLMs from different families. Experimental results on summarization benchmarks demonstrate that KD reduces hallucination compared to standard finetuning while preserving performance on general NLP tasks. These findings highlight KD as a promising approach for mitigating hallucination in LLMs and improving model reliability. </p> </div> </dd> <dt> <a name='item92'>[92]</a> <a href ="/abs/2502.11330" title="Abstract" id="2502.11330"> arXiv:2502.11330 </a> [<a href="/pdf/2502.11330" title="Download PDF" id="pdf-2502.11330" aria-labelledby="pdf-2502.11330">pdf</a>, <a href="https://arxiv.org/html/2502.11330v1" title="View HTML" id="html-2502.11330" aria-labelledby="html-2502.11330" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11330" title="Other formats" id="oth-2502.11330" aria-labelledby="oth-2502.11330">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> System Message Generation for User Preferences using Open-Source Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jeong,+M">Minbyul Jeong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+J">Jungho Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khang,+M">Minsoo Khang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jung,+D">Dawoon Jung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+T">Teakgyu Hong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) 
</div> <p class='mathjax'> System messages play a crucial role in interactions with large language models (LLMs), often serving as prompts to initiate conversations. Through system messages, users can assign specific roles, perform intended tasks, incorporate background information, specify various output formats and communication styles. Despite such versatility, publicly available data are often lack system messages and subject to strict license constraints in the industry field. Manual labeling of publicly available data with system messages that align with user instructions demands significant resources. In view of such challenges, our work introduces SysGen, a pipeline for generating system messages with better aligned assistant responses from the supervised fine-tuning dataset without system messages. Training on SysGen data has demonstrated substantial improvements in the alignment of model responses with system messages and user instructions, as demonstrated across various open-source models on the Multifacet benchmark, while maintaining minimal impact on other unseen benchmarks such as Open LLM Leaderboard 2. Our qualitative analysis highlights the importance of diverse system messages to ensure better adaptability across different contexts. 
</p> </div> </dd> <dt> <a name='item93'>[93]</a> <a href ="/abs/2502.11336" title="Abstract" id="2502.11336"> arXiv:2502.11336 </a> [<a href="/pdf/2502.11336" title="Download PDF" id="pdf-2502.11336" aria-labelledby="pdf-2502.11336">pdf</a>, <a href="https://arxiv.org/html/2502.11336v1" title="View HTML" id="html-2502.11336" aria-labelledby="html-2502.11336" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11336" title="Other formats" id="oth-2502.11336" aria-labelledby="oth-2502.11336">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ExaGPT: Example-Based Machine-Generated Text Detection for Human Interpretability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koike,+R">Ryuto Koike</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaneko,+M">Masahiro Kaneko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niwa,+A">Ayana Niwa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nakov,+P">Preslav Nakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Okazaki,+N">Naoaki Okazaki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Detecting texts generated by Large Language Models (LLMs) could cause grave mistakes due to incorrect decisions, such as undermining student's academic dignity. LLM text detection thus needs to ensure the interpretability of the decision, which can help users judge how reliably correct its prediction is. When humans verify whether a text is human-written or LLM-generated, they intuitively investigate with which of them it shares more similar spans. 
However, existing interpretable detectors are not aligned with the human decision-making process and fail to offer evidence that users easily understand. To bridge this gap, we introduce ExaGPT, an interpretable detection approach grounded in the human decision-making process for verifying the origin of a text. ExaGPT identifies a text by checking whether it shares more similar spans with human-written vs. with LLM-generated texts from a datastore. This approach can provide similar span examples that contribute to the decision for each span in the text as evidence. Our human evaluation demonstrates that providing similar span examples contributes more effectively to judging the correctness of the decision than existing interpretable methods. Moreover, extensive experiments in four domains and three generators show that ExaGPT massively outperforms prior powerful detectors by up to +40.9 points of accuracy at a false positive rate of 1%. </p> </div> </dd> <dt> <a name='item94'>[94]</a> <a href ="/abs/2502.11345" title="Abstract" id="2502.11345"> arXiv:2502.11345 </a> [<a href="/pdf/2502.11345" title="Download PDF" id="pdf-2502.11345" aria-labelledby="pdf-2502.11345">pdf</a>, <a href="https://arxiv.org/html/2502.11345v1" title="View HTML" id="html-2502.11345" aria-labelledby="html-2502.11345" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11345" title="Other formats" id="oth-2502.11345" aria-labelledby="oth-2502.11345">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hierarchical Graph Topic Modeling with Topic Tree-based Transformer </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D+C">Delvin Ce Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Menglin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xiaobao Wu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiasheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lauw,+H+W">Hady W. Lauw</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Textual documents are commonly connected in a hierarchical graph structure where a central document links to others with an exponentially growing connectivity. Though Hyperbolic Graph Neural Networks (HGNNs) excel at capturing such graph hierarchy, they cannot model the rich textual semantics within documents. Moreover, text contents in documents usually discuss topics of different specificity. Hierarchical Topic Models (HTMs) discover such latent topic hierarchy within text corpora. However, most of them focus on the textual content within documents, and ignore the graph adjacency across interlinked documents. We thus propose a Hierarchical Graph Topic Modeling Transformer to integrate both topic hierarchy within documents and graph hierarchy across documents into a unified Transformer. Specifically, to incorporate topic hierarchy within documents, we design a topic tree and infer a hierarchical tree embedding for hierarchical topic modeling. To preserve both topic and graph hierarchies, we design our model in hyperbolic space and propose Hyperbolic Doubly Recurrent Neural Network, which models ancestral and fraternal tree structure. Both hierarchies are inserted into each Transformer layer to learn unified representations. Both supervised and unsupervised experiments verify the effectiveness of our model. 
</p> </div> </dd> <dt> <a name='item95'>[95]</a> <a href ="/abs/2502.11355" title="Abstract" id="2502.11355"> arXiv:2502.11355 </a> [<a href="/pdf/2502.11355" title="Download PDF" id="pdf-2502.11355" aria-labelledby="pdf-2502.11355">pdf</a>, <a href="/format/2502.11355" title="Other formats" id="oth-2502.11355" aria-labelledby="oth-2502.11355">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> "Nuclear Deployed!": Analyzing Catastrophic Risks in Decision-making of Autonomous LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Rongwu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaojian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wei Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Our code will be available at <a href="https://github.com/pillowsofwind/LLM-CBRN-Risks" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR); Computers and Society (cs.CY) </div> <p class='mathjax'> Large language models (LLMs) are evolving into autonomous decision-makers, raising concerns about catastrophic risks in high-stakes scenarios, particularly in Chemical, Biological, Radiological and Nuclear (CBRN) domains. Based on the insight that such risks can originate from trade-offs between the agent's Helpful, Harmlessness and Honest (HHH) goals, we build a novel three-stage evaluation framework, which is carefully constructed to effectively and naturally expose such risks. 
We conduct 14,400 agentic simulations across 12 advanced LLMs, with extensive experiments and analysis. Results reveal that LLM agents can autonomously engage in catastrophic behaviors and deception, without being deliberately induced. Furthermore, stronger reasoning abilities often increase, rather than mitigate, these risks. We also show that these agents can violate instructions and superior commands. On the whole, we empirically prove the existence of catastrophic risks in autonomous LLM agents. We will release our code upon request. </p> </div> </dd> <dt> <a name='item96'>[96]</a> <a href ="/abs/2502.11361" title="Abstract" id="2502.11361"> arXiv:2502.11361 </a> [<a href="/pdf/2502.11361" title="Download PDF" id="pdf-2502.11361" aria-labelledby="pdf-2502.11361">pdf</a>, <a href="https://arxiv.org/html/2502.11361v1" title="View HTML" id="html-2502.11361" aria-labelledby="html-2502.11361" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11361" title="Other formats" id="oth-2502.11361" aria-labelledby="oth-2502.11361">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VLDBench: Vision Language Models Disinformation Detection Benchmark </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Raza,+S">Shaina Raza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vayani,+A">Ashmal Vayani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jain,+A">Aditya Jain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Narayanan,+A">Aravind Narayanan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khazaie,+V+R">Vahid Reza Khazaie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bashir,+S+R">Syed Raza Bashir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dolatabadi,+E">Elham Dolatabadi</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Uddin,+G">Gias Uddin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Emmanouilidis,+C">Christos Emmanouilidis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qureshi,+R">Rizwan Qureshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shah,+M">Mubarak Shah</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid rise of AI-generated content has made detecting disinformation increasingly challenging. In particular, multimodal disinformation, i.e., online posts-articles that contain images and texts with fabricated information are specially designed to deceive. While existing AI safety benchmarks primarily address bias and toxicity, multimodal disinformation detection remains largely underexplored. To address this challenge, we present the Vision-Language Disinformation Detection Benchmark VLDBench, the first comprehensive benchmark for detecting disinformation across both unimodal (text-only) and multimodal (text and image) content, comprising 31,000 news article-image pairs, spanning 13 distinct categories, for robust evaluation. VLDBench features a rigorous semi-automated data curation pipeline, with 22 domain experts dedicating 300 plus hours to annotation, achieving a strong inter-annotator agreement (Cohen kappa = 0.78). We extensively evaluate state-of-the-art Large Language Models (LLMs) and Vision-Language Models (VLMs), demonstrating that integrating textual and visual cues in multimodal news posts improves disinformation detection accuracy by 5 - 35 % compared to unimodal models. 
Developed in alignment with AI governance frameworks such as the EU AI Act, NIST guidelines, and the MIT AI Risk Repository 2024, VLDBench is expected to become a benchmark for detecting disinformation in online multi-modal contents. Our code and data will be publicly available. </p> </div> </dd> <dt> <a name='item97'>[97]</a> <a href ="/abs/2502.11364" title="Abstract" id="2502.11364"> arXiv:2502.11364 </a> [<a href="/pdf/2502.11364" title="Download PDF" id="pdf-2502.11364" aria-labelledby="pdf-2502.11364">pdf</a>, <a href="https://arxiv.org/html/2502.11364v1" title="View HTML" id="html-2502.11364" aria-labelledby="html-2502.11364" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11364" title="Other formats" id="oth-2502.11364" aria-labelledby="oth-2502.11364">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Blessing of Multilinguality: A Systematic Analysis of Multilingual In-Context Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Y">Yilei Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+A">Andrew Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+F">Freda Shi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While multilingual large language models generally perform adequately, and sometimes even rival English performance on high-resource languages (HRLs), they often significantly underperform on low-resource languages (LRLs). Among several prompting strategies aiming at bridging the gap, multilingual in-context learning (ICL) has been particularly effective when demonstration in target languages is unavailable. However, there lacks a systematic understanding when and why it works well. 
<br>In this work, we systematically analyze multilingual ICL, using demonstrations in HRLs to enhance cross-lingual transfer. We show that demonstrations in mixed HRLs consistently outperform English-only ones across the board, particularly for tasks written in LRLs. Surprisingly, our ablation study shows that the presence of irrelevant non-English sentences in the prompt yields measurable gains, suggesting the effectiveness of multilingual exposure itself. Our results highlight the potential of strategically leveraging multilingual resources to bridge the performance gap for underrepresented languages. </p> </div> </dd> <dt> <a name='item98'>[98]</a> <a href ="/abs/2502.11368" title="Abstract" id="2502.11368"> arXiv:2502.11368 </a> [<a href="/pdf/2502.11368" title="Download PDF" id="pdf-2502.11368" aria-labelledby="pdf-2502.11368">pdf</a>, <a href="https://arxiv.org/html/2502.11368v1" title="View HTML" id="html-2502.11368" aria-labelledby="html-2502.11368" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11368" title="Other formats" id="oth-2502.11368" aria-labelledby="oth-2502.11368">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs can Perform Multi-Dimensional Analytic Writing Assessments: A Case Study of L2 Graduate-Level Academic English Writing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhengxiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Makarova,+V">Veronika Makarova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kodner,+J">Jordan Kodner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rambow,+O">Owen Rambow</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 26 pages, 6 figures, 15 tables </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The paper explores the performance of LLMs in the context of multi-dimensional analytic writing assessments, i.e. their ability to provide both scores and comments based on multiple assessment criteria. Using a corpus of literature reviews written by L2 graduate students and assessed by human experts against 9 analytic criteria, we prompt several popular LLMs to perform the same task under various conditions. To evaluate the quality of feedback comments, we apply a novel feedback comment quality evaluation framework. This framework is interpretable, cost-efficient, scalable, and reproducible, compared to existing methods that rely on manual judgments. We find that LLMs can generate reasonably good and generally reliable multi-dimensional analytic assessments. We release our corpus for reproducibility. </p> </div> </dd> <dt> <a name='item99'>[99]</a> <a href ="/abs/2502.11380" title="Abstract" id="2502.11380"> arXiv:2502.11380 </a> [<a href="/pdf/2502.11380" title="Download PDF" id="pdf-2502.11380" aria-labelledby="pdf-2502.11380">pdf</a>, <a href="https://arxiv.org/html/2502.11380v1" title="View HTML" id="html-2502.11380" aria-labelledby="html-2502.11380" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11380" title="Other formats" id="oth-2502.11380" aria-labelledby="oth-2502.11380">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring the Small World of Word Embeddings: A Comparative Study on Conceptual Spaces from LLMs of Different Scales </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Ying Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">KangYang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+C">Cunliang Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Paper under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A conceptual space represents concepts as nodes and semantic relatedness as edges. Word embeddings, combined with a similarity metric, provide an effective approach to constructing such a space. Typically, embeddings are derived from traditional distributed models or encoder-only pretrained models, whose objectives directly capture the meaning of the current token. In contrast, decoder-only models, including large language models (LLMs), predict the next token, making their embeddings less directly tied to the current token's semantics. Moreover, comparative studies on LLMs of different scales remain underexplored. In this paper, we construct a conceptual space using word embeddings from LLMs of varying scales and comparatively analyze their properties. We establish a network based on a linguistic typology-inspired connectivity hypothesis, examine global statistical properties, and compare LLMs of varying scales. Locally, we analyze conceptual pairs, WordNet relations, and a cross-lingual semantic network for qualitative words. Our results indicate that the constructed space exhibits small-world properties, characterized by a high clustering coefficient and short path lengths. Larger LLMs generate more intricate spaces, with longer paths reflecting richer relational structures and connections. Furthermore, the network serves as an efficient bridge for cross-lingual semantic mapping. 
</p> </div> </dd> <dt> <a name='item100'>[100]</a> <a href ="/abs/2502.11387" title="Abstract" id="2502.11387"> arXiv:2502.11387 </a> [<a href="/pdf/2502.11387" title="Download PDF" id="pdf-2502.11387" aria-labelledby="pdf-2502.11387">pdf</a>, <a href="https://arxiv.org/html/2502.11387v1" title="View HTML" id="html-2502.11387" aria-labelledby="html-2502.11387" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11387" title="Other formats" id="oth-2502.11387" aria-labelledby="oth-2502.11387">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RoleMRC: A Fine-Grained Composite Benchmark for Role-Playing and Instruction-Following </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Junru Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiazheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+G">Guodong Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gui,+L">Lin Gui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+S">Siyu An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yulan He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Di Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+X">Xing Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Role-playing is important for Large Language Models (LLMs) to follow diverse instructions while maintaining role identity and the role's pre-defined ability limits. Existing role-playing datasets mostly contribute to controlling role style and knowledge boundaries, but overlook role-playing in instruction-following scenarios. 
We introduce a fine-grained role-playing and instruction-following composite benchmark, named RoleMRC, including: (1) Multi-turn dialogues between ideal roles and humans, including free chats or discussions upon given passages; (2) Role-playing machine reading comprehension, involving response, refusal, and attempts according to passage answerability and role ability; (3) More complex scenarios with nested, multi-turn and prioritized instructions. The final RoleMRC features a 10.2k role profile meta-pool, 37.9k well-synthesized role-playing instructions, and 1.4k testing samples. We develop a pipeline to quantitatively evaluate the fine-grained role-playing and instruction-following capabilities of several mainstream LLMs, as well as models that are fine-tuned on our data. Moreover, cross-evaluation on external role-playing datasets confirms that models fine-tuned on RoleMRC enhances instruction-following without compromising general role-playing and reasoning capabilities. We also probe the neural-level activation maps of different capabilities over post-tuned LLMs. Access to our RoleMRC, RoleMRC-mix and Codes: <a href="https://github.com/LuJunru/RoleMRC" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item101'>[101]</a> <a href ="/abs/2502.11393" title="Abstract" id="2502.11393"> arXiv:2502.11393 </a> [<a href="/pdf/2502.11393" title="Download PDF" id="pdf-2502.11393" aria-labelledby="pdf-2502.11393">pdf</a>, <a href="https://arxiv.org/html/2502.11393v1" title="View HTML" id="html-2502.11393" aria-labelledby="html-2502.11393" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11393" title="Other formats" id="oth-2502.11393" aria-labelledby="oth-2502.11393">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HellaSwag-Pro: A Large-Scale Bilingual Benchmark for Evaluating the Robustness of LLMs in Commonsense Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaoyuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Moxin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Men,+R">Rui Men</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yichang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+K">Keqin Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+F">Fuli Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+D">Dayiheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Junyang Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable capabilities in commonsense reasoning; however, some variations in questions can trigger incorrect responses. 
Do these models truly understand commonsense knowledge, or just memorize expression patterns? To investigate this question, we present the first extensive robustness evaluation of LLMs in commonsense reasoning. We introduce HellaSwag-Pro, a large-scale bilingual benchmark consisting of 11,200 cases, by designing and compiling seven types of question variants. To construct this benchmark, we propose a two-stage method to develop Chinese HellaSwag, a finely annotated dataset comprising 12,000 instances across 56 categories. We conduct extensive experiments on 41 representative LLMs, revealing that these LLMs are far from robust in commonsense reasoning. Furthermore, this robustness varies depending on the language in which the LLM is tested. This work establishes a high-quality evaluation benchmark, with extensive experiments offering valuable insights to the community in commonsense reasoning for LLMs. </p> </div> </dd> <dt> <a name='item102'>[102]</a> <a href ="/abs/2502.11400" title="Abstract" id="2502.11400"> arXiv:2502.11400 </a> [<a href="/pdf/2502.11400" title="Download PDF" id="pdf-2502.11400" aria-labelledby="pdf-2502.11400">pdf</a>, <a href="https://arxiv.org/html/2502.11400v1" title="View HTML" id="html-2502.11400" aria-labelledby="html-2502.11400" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11400" title="Other formats" id="oth-2502.11400" aria-labelledby="oth-2502.11400">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revisiting Robust RAG: Do We Still Need Complex Robust Training in the Era of Powerful LLMs? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+H">Hanxing Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+S">Shuchang Tao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zihao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liwei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Kun Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-augmented generation (RAG) systems often suffer from performance degradation when encountering noisy or irrelevant documents, driving researchers to develop sophisticated training strategies to enhance their robustness against such retrieval noise. However, as large language models (LLMs) continue to advance, the necessity of these complex training methods is increasingly questioned. In this paper, we systematically investigate whether complex robust training strategies remain necessary as model capacity grows. Through comprehensive experiments spanning multiple model architectures and parameter scales, we evaluate various document selection methods and adversarial training techniques across diverse datasets. Our extensive experiments consistently demonstrate that as models become more powerful, the performance gains brought by complex robust training methods drop off dramatically. 
We delve into the rationale and find that more powerful models inherently exhibit superior confidence calibration, better generalization across datasets (even when trained with randomly selected documents), and optimal attention mechanisms learned with simpler strategies. Our findings suggest that RAG systems can benefit from simpler architectures and training strategies as models become more powerful, enabling more scalable applications with minimal complexity. </p> </div> </dd> <dt> <a name='item103'>[103]</a> <a href ="/abs/2502.11401" title="Abstract" id="2502.11401"> arXiv:2502.11401 </a> [<a href="/pdf/2502.11401" title="Download PDF" id="pdf-2502.11401" aria-labelledby="pdf-2502.11401">pdf</a>, <a href="https://arxiv.org/html/2502.11401v1" title="View HTML" id="html-2502.11401" aria-labelledby="html-2502.11401" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11401" title="Other formats" id="oth-2502.11401" aria-labelledby="oth-2502.11401">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Following the Autoregressive Nature of LLM Embeddings via Compression and Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+J">Jingcheng Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zhongtao Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liwei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Kun Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zihao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> A new trend uses LLMs as dense text encoders via contrastive learning. However, since LLM embeddings predict the probability distribution of the next token, they are inherently generative and distributive, conflicting with contrastive learning, which requires embeddings to capture full-text semantics and align via cosine similarity. This discrepancy hinders the full utilization of LLMs' pre-training capabilities, resulting in inefficient learning. In response to this issue, we propose AutoRegEmbed, a new contrastive learning method built on embedding conditional probability distributions, which integrates two core tasks: information compression and conditional distribution alignment. The information compression task encodes text into the embedding space, ensuring that the embedding vectors capture global semantics. The conditional distribution alignment task focuses on aligning text embeddings with positive samples embeddings by leveraging the conditional distribution of embeddings while simultaneously reducing the likelihood of generating negative samples from text embeddings, thereby achieving embedding alignment and uniformity. Experimental results demonstrate that our method significantly outperforms traditional contrastive learning approaches and achieves performance comparable to state-of-the-art models when using the same amount of data. 
</p> </div> </dd> <dt> <a name='item104'>[104]</a> <a href ="/abs/2502.11404" title="Abstract" id="2502.11404"> arXiv:2502.11404 </a> [<a href="/pdf/2502.11404" title="Download PDF" id="pdf-2502.11404" aria-labelledby="pdf-2502.11404">pdf</a>, <a href="https://arxiv.org/html/2502.11404v1" title="View HTML" id="html-2502.11404" aria-labelledby="html-2502.11404" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11404" title="Other formats" id="oth-2502.11404" aria-labelledby="oth-2502.11404">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ToolCoder: A Systematic Code-Empowered Tool Learning Framework for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+H">Hanxing Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+S">Shuchang Tao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zihao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jinyang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+B">Bolin Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+H">Huawei Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xueqi Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Tool learning has emerged as a crucial capability for large language models (LLMs) to solve complex real-world tasks through interaction with external tools. Existing approaches face significant challenges, including reliance on hand-crafted prompts, difficulty in multi-step planning, and lack of precise error diagnosis and reflection mechanisms. 
We propose ToolCoder, a novel framework that reformulates tool learning as a code generation task. Inspired by software engineering principles, ToolCoder transforms natural language queries into structured Python function scaffold and systematically breaks down tasks with descriptive comments, enabling LLMs to leverage coding paradigms for complex reasoning and planning. It then generates and executes function implementations to obtain final responses. Additionally, ToolCoder stores successfully executed functions in a repository to promote code reuse, while leveraging error traceback mechanisms for systematic debugging, optimizing both execution efficiency and robustness. Experiments demonstrate that ToolCoder achieves superior performance in task completion accuracy and execution reliability compared to existing approaches, establishing the effectiveness of code-centric approaches in tool learning. </p> </div> </dd> <dt> <a name='item105'>[105]</a> <a href ="/abs/2502.11405" title="Abstract" id="2502.11405"> arXiv:2502.11405 </a> [<a href="/pdf/2502.11405" title="Download PDF" id="pdf-2502.11405" aria-labelledby="pdf-2502.11405">pdf</a>, <a href="https://arxiv.org/html/2502.11405v1" title="View HTML" id="html-2502.11405" aria-labelledby="html-2502.11405" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11405" title="Other formats" id="oth-2502.11405" aria-labelledby="oth-2502.11405">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LayAlign: Enhancing Multilingual Reasoning in Large Language Models via Layer-Wise Adaptive Fusion and Alignment Strategy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+Z">Zhiwen Ruan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yixia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+H">He Zhu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Longyue Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+W">Weihua Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Kaifu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Guanhua Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In Findings of NAACL 2025 (The 2025 Annual Conference of the Nations of the Americas Chapter of the ACL) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Despite being pretrained on multilingual corpora, large language models (LLMs) exhibit suboptimal performance on low-resource languages. Recent approaches have leveraged multilingual encoders alongside LLMs by introducing trainable parameters connecting the two models. However, these methods typically focus on the encoder's output, overlooking valuable information from other layers. We propose Layer-Wise Adaptive Fusion and Alignment (LayAlign), a framework that integrates representations from all encoder layers, coupled with an adaptive attention mechanism to enable layer-wise interaction between the LLM and the multilingual encoder. Extensive experiments on multilingual reasoning tasks, along with analyses of learned representations, show that our approach consistently outperforms existing baselines. 
</p> </div> </dd> <dt> <a name='item106'>[106]</a> <a href ="/abs/2502.11419" title="Abstract" id="2502.11419"> arXiv:2502.11419 </a> [<a href="/pdf/2502.11419" title="Download PDF" id="pdf-2502.11419" aria-labelledby="pdf-2502.11419">pdf</a>, <a href="https://arxiv.org/html/2502.11419v1" title="View HTML" id="html-2502.11419" aria-labelledby="html-2502.11419" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11419" title="Other formats" id="oth-2502.11419" aria-labelledby="oth-2502.11419">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InsBank: Evolving Instruction Subset for Ongoing Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+J">Jiayi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiwei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+S">Shaoxiong Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+P">Peiwen Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinglin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yueqi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+C">Chuyi Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+B">Boyuan Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+H">Huan Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yao Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) typically undergo instruction tuning to enhance alignment. 
Recent studies emphasize that quality and diversity of instruction data are more crucial than quantity, highlighting the need to select diverse, high-quality subsets to reduce training costs. However, how to evolve these selected subsets alongside the development of new instruction data remains insufficiently explored. To achieve LLMs' ongoing alignment, we introduce Instruction Bank (InsBank), a continuously updated repository that integrates the latest valuable instruction data. We further propose Progressive Instruction Bank Evolution (PIBE), a novel framework designed to evolve InsBank effectively and efficiently over time. PIBE employs a gradual data selection strategy to maintain long-term efficiency, leveraging a representation-based diversity score to capture relationships between data points and retain historical information for comprehensive diversity evaluation. This also allows for flexible combination of diversity and quality scores during data selection and ranking. Extensive experiments demonstrate that PIBE significantly outperforms baselines in InsBank evolution and is able to extract budget-specific subsets, demonstrating its effectiveness and adaptability. 
</p> </div> </dd> <dt> <a name='item107'>[107]</a> <a href ="/abs/2502.11423" title="Abstract" id="2502.11423"> arXiv:2502.11423 </a> [<a href="/pdf/2502.11423" title="Download PDF" id="pdf-2502.11423" aria-labelledby="pdf-2502.11423">pdf</a>, <a href="https://arxiv.org/html/2502.11423v1" title="View HTML" id="html-2502.11423" aria-labelledby="html-2502.11423" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11423" title="Other formats" id="oth-2502.11423" aria-labelledby="oth-2502.11423">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Persona Sentiment Sensitivity in Personalized Dialogue Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jun,+Y">YongHyun Jun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Personalized dialogue systems have advanced considerably with the integration of user-specific personas into large language models (LLMs). However, while LLMs can effectively generate personalized responses, the influence of persona sentiment on dialogue quality remains underexplored. In this work, we conduct a large-scale analysis of dialogues generated using a range of polarized user profiles. Our experiments reveal that dialogues involving negatively polarized users tend to overemphasize persona attributes, leading to increased entailment and contradiction instances and lower overall coherence. In contrast, positively polarized profiles yield dialogues that selectively incorporate persona information, resulting in smoother and more coherent interactions. 
Furthermore, we find that personas with weak or neutral sentiment generally produce lower-quality dialogues. Motivated by these findings, we propose a dialogue generation approach that explicitly accounts for persona polarity by combining a turn-based generation strategy with a profile ordering mechanism. Our study provides new insights into the sensitivity of LLMs to persona sentiment and offers guidance for developing more robust and nuanced personalized dialogue systems. </p> </div> </dd> <dt> <a name='item108'>[108]</a> <a href ="/abs/2502.11425" title="Abstract" id="2502.11425"> arXiv:2502.11425 </a> [<a href="/pdf/2502.11425" title="Download PDF" id="pdf-2502.11425" aria-labelledby="pdf-2502.11425">pdf</a>, <a href="/format/2502.11425" title="Other formats" id="oth-2502.11425" aria-labelledby="oth-2502.11425">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Counterfactual-Consistency Prompting for Relative Temporal Understanding in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jongho Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+S">Seung-won Hwang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Despite the advanced capabilities of large language models (LLMs), their temporal reasoning ability remains underdeveloped. Prior works have highlighted this limitation, particularly in maintaining temporal consistency when understanding events. For example, models often confuse mutually exclusive temporal relations like ``before'' and ``after'' between events and make inconsistent predictions. 
In this work, we tackle the issue of temporal inconsistency in LLMs by proposing a novel counterfactual prompting approach. Our method generates counterfactual questions and enforces collective constraints, enhancing the model's consistency. We evaluate our method on multiple datasets, demonstrating significant improvements in event ordering for explicit and implicit events and temporal commonsense understanding by effectively addressing temporal inconsistencies. </p> </div> </dd> <dt> <a name='item109'>[109]</a> <a href ="/abs/2502.11427" title="Abstract" id="2502.11427"> arXiv:2502.11427 </a> [<a href="/pdf/2502.11427" title="Download PDF" id="pdf-2502.11427" aria-labelledby="pdf-2502.11427">pdf</a>, <a href="https://arxiv.org/html/2502.11427v1" title="View HTML" id="html-2502.11427" aria-labelledby="html-2502.11427" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11427" title="Other formats" id="oth-2502.11427" aria-labelledby="oth-2502.11427">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Do we Really Need Visual Instructions? 
Towards Visual Instruction-Free Fine-tuning for Large Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zikang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+K">Kun Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+W+X">Wayne Xin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+D">Dawei Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yaliang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+J">Ji-Rong Wen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Visual instruction tuning has become the predominant technology in eliciting the multimodal task-solving capabilities of large vision-language models (LVLMs). Despite the success, as visual instructions require images as the input, it would leave the gap in inheriting the task-solving capabilities from the backbone LLMs, and make it costly to collect a large-scale dataset. To address it, we propose ViFT, a visual instruction-free fine-tuning framework for LVLMs. In ViFT, we only require the text-only instructions and image caption data during training, to separately learn the task-solving and visual perception abilities. During inference, we extract and combine the representations of the text and image inputs, for fusing the two abilities to fulfill multimodal tasks. Experimental results demonstrate that ViFT can achieve state-of-the-art performance on several visual reasoning and visual instruction following benchmarks, with rather less training data. Our code and data will be publicly released. 
</p> </div> </dd> <dt> <a name='item110'>[110]</a> <a href ="/abs/2502.11431" title="Abstract" id="2502.11431"> arXiv:2502.11431 </a> [<a href="/pdf/2502.11431" title="Download PDF" id="pdf-2502.11431" aria-labelledby="pdf-2502.11431">pdf</a>, <a href="https://arxiv.org/html/2502.11431v1" title="View HTML" id="html-2502.11431" aria-labelledby="html-2502.11431" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11431" title="Other formats" id="oth-2502.11431" aria-labelledby="oth-2502.11431">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Any Information Is Just Worth One Single Screenshot: Unifying Search With Visualized Information Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Ze Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Z">Zhengyang Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Junjie Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lian,+D">Defu Lian</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> With the popularity of multimodal techniques, it receives growing interests to acquire useful information in visual forms. In this work, we formally define an emerging IR paradigm called \textit{Visualized Information Retrieval}, or \textbf{Vis-IR}, where multimodal information, such as texts, images, tables and charts, is jointly represented by a unified visual format called \textbf{Screenshots}, for various retrieval applications. We further make three key contributions for Vis-IR. 
First, we create \textbf{VIRA} (Vis-IR Aggregation), a large-scale dataset comprising a vast collection of screenshots from diverse sources, carefully curated into captioned and question-answer formats. Second, we develop \textbf{UniSE} (Universal Screenshot Embeddings), a family of retrieval models that enable screenshots to query or be queried across arbitrary data modalities. Finally, we construct \textbf{MVRB} (Massive Visualized IR Benchmark), a comprehensive benchmark covering a variety of task forms and application scenarios. Through extensive evaluations on MVRB, we highlight the deficiency from existing multimodal retrievers and the substantial improvements made by UniSE. Our work will be shared with the community, laying a solid foundation for this emerging field. </p> </div> </dd> <dt> <a name='item111'>[111]</a> <a href ="/abs/2502.11438" title="Abstract" id="2502.11438"> arXiv:2502.11438 </a> [<a href="/pdf/2502.11438" title="Download PDF" id="pdf-2502.11438" aria-labelledby="pdf-2502.11438">pdf</a>, <a href="https://arxiv.org/html/2502.11438v1" title="View HTML" id="html-2502.11438" aria-labelledby="html-2502.11438" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11438" title="Other formats" id="oth-2502.11438" aria-labelledby="oth-2502.11438">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SAFE-SQL: Self-Augmented In-Context Learning with Fine-grained Example Selection for Text-to-SQL </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jimin Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baek,+I">Ingeol Baek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+B">Byeongjeong Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 5 figures, 10 
tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Text-to-SQL aims to convert natural language questions into executable SQL queries. While previous approaches, such as skeleton-masked selection, have demonstrated strong performance by retrieving similar training examples to guide large language models (LLMs), they struggle in real-world scenarios where such examples are unavailable. To overcome this limitation, we propose Self-Augmentation in-context learning with Fine-grained Example selection for Text-to-SQL (SAFE-SQL), a novel framework that improves SQL generation by generating and filtering self-augmented examples. SAFE-SQL first prompts an LLM to generate multiple Text-to-SQL examples relevant to the test input. Then SAFE-SQL filters these examples through three relevance assessments, constructing high-quality in-context learning examples. Using self-generated examples, SAFE-SQL surpasses the previous zero-shot, and few-shot Text-to-SQL frameworks, achieving higher execution accuracy. Notably, our approach provides additional performance gains in extra hard and unseen scenarios, where conventional methods often fail. 
</p> </div> </dd> <dt> <a name='item112'>[112]</a> <a href ="/abs/2502.11439" title="Abstract" id="2502.11439"> arXiv:2502.11439 </a> [<a href="/pdf/2502.11439" title="Download PDF" id="pdf-2502.11439" aria-labelledby="pdf-2502.11439">pdf</a>, <a href="https://arxiv.org/html/2502.11439v1" title="View HTML" id="html-2502.11439" aria-labelledby="html-2502.11439" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11439" title="Other formats" id="oth-2502.11439" aria-labelledby="oth-2502.11439">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An Efficient Row-Based Sparse Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Cen-Jhih Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhaskara,+A">Aditya Bhaskara</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Fine-tuning is an important step in adapting foundation models such as large language models to downstream tasks. To make this step more accessible to users with limited computational budgets, it is crucial to develop fine-tuning methods that are memory and computationally efficient. Sparse Fine-tuning (SFT) and Low-rank adaptation (LoRA) are two frameworks that have emerged for addressing this problem and have been adopted widely in practice. In this work, we develop a new SFT framework, based on ideas from neural network pruning. At a high level, we first identify "important" neurons/nodes using feature importance metrics from network pruning (specifically, we use the structural pruning method), and then perform fine-tuning by restricting to weights involving these neurons. 
Using experiments on common language tasks, we demonstrate that our method significantly improves the memory efficiency of SFT without increasing training time complexity and implementation complexity, while achieving accuracy comparable to state-of-the-art methods such as LoRA and its variants. </p> </div> </dd> <dt> <a name='item113'>[113]</a> <a href ="/abs/2502.11441" title="Abstract" id="2502.11441"> arXiv:2502.11441 </a> [<a href="/pdf/2502.11441" title="Download PDF" id="pdf-2502.11441" aria-labelledby="pdf-2502.11441">pdf</a>, <a href="https://arxiv.org/html/2502.11441v1" title="View HTML" id="html-2502.11441" aria-labelledby="html-2502.11441" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11441" title="Other formats" id="oth-2502.11441" aria-labelledby="oth-2502.11441">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Which Retain Set Matters for LLM Unlearning? A Case Study on Entity Unlearning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+H">Hwan Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in Progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) risk retaining unauthorized or sensitive information from their training data, which raises privacy concerns. LLM unlearning seeks to mitigate these risks by selectively removing specified data while maintaining overall model performance. However, most existing work focuses on methods to achieve effective forgetting and does not provide a detailed analysis of the retain set, the portion of training data that is not targeted for removal. 
In this paper, we investigate the effects of unlearning on various subsets of the retain set through a case study on entity unlearning. We introduce the Syntactically Similar Neighbor Set, a group of queries that share similar syntactic structures with the data targeted for removal, and show that this subset suffers the greatest performance drop during unlearning. Moreover, when used for regularization, this set not only preserves performance on syntactically similar queries but also delivers comparable or improved results across other data subsets. Our results highlight that syntactic similarity is a critical factor, potentially more so than domain or entity relationships, in achieving effective and practical LLM unlearning. </p> </div> </dd> <dt> <a name='item114'>[114]</a> <a href ="/abs/2502.11444" title="Abstract" id="2502.11444"> arXiv:2502.11444 </a> [<a href="/pdf/2502.11444" title="Download PDF" id="pdf-2502.11444" aria-labelledby="pdf-2502.11444">pdf</a>, <a href="https://arxiv.org/html/2502.11444v1" title="View HTML" id="html-2502.11444" aria-labelledby="html-2502.11444" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11444" title="Other formats" id="oth-2502.11444" aria-labelledby="oth-2502.11444">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Does RAG Really Perform Bad For Long-Context Processing? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kun Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+P">Peitian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+H">Hongjin Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jun Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kang Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The efficient processing of long context poses a serious challenge for large language models (LLMs). Recently, retrieval-augmented generation (RAG) has emerged as a promising strategy for this problem, as it enables LLMs to make selective use of the long context for efficient computation. However, existing RAG approaches lag behind other long-context processing methods due to inherent limitations on inaccurate retrieval and fragmented contexts. To address these challenges, we introduce RetroLM, a novel RAG framework for long-context processing. Unlike traditional methods, RetroLM employs KV-level retrieval augmentation, where it partitions the LLM's KV cache into contiguous pages and retrieves the most crucial ones for efficient computation. This approach enhances robustness to retrieval inaccuracy, facilitates effective utilization of fragmented contexts, and saves the cost from repeated computation. Building on this framework, we further develop a specialized retriever for precise retrieval of critical pages and conduct unsupervised post-training to optimize the model's ability to leverage retrieved information. 
We conduct comprehensive evaluations with a variety of benchmarks, including LongBench, InfiniteBench, and RULER, where RetroLM significantly outperforms existing long-context LLMs and efficient long-context processing methods, particularly in tasks requiring intensive reasoning or extremely long-context comprehension. </p> </div> </dd> <dt> <a name='item115'>[115]</a> <a href ="/abs/2502.11451" title="Abstract" id="2502.11451"> arXiv:2502.11451 </a> [<a href="/pdf/2502.11451" title="Download PDF" id="pdf-2502.11451" aria-labelledby="pdf-2502.11451">pdf</a>, <a href="https://arxiv.org/html/2502.11451v1" title="View HTML" id="html-2502.11451" aria-labelledby="html-2502.11451" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11451" title="Other formats" id="oth-2502.11451" aria-labelledby="oth-2502.11451">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Personas to Talks: Revisiting the Impact of Personas on LLM-Synthesized Emotional Support Conversations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shenghan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+Y">Yang Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yimo Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+W">Wynne Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+M+L">Mong Li Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid advancement of Large Language Models (LLMs) has revolutionized the generation of emotional support conversations (ESC), offering scalable solutions with reduced costs and enhanced data privacy. This paper explores the role of personas in the creation of ESC by LLMs. 
Our research utilizes established psychological frameworks to measure and infuse persona traits into LLMs, which then generate dialogues in the emotional support scenario. We conduct extensive evaluations to understand the stability of persona traits in dialogues, examining shifts in traits post-generation and their impact on dialogue quality and strategy distribution. Experimental results reveal several notable findings: 1) LLMs can infer core persona traits, 2) subtle shifts in emotionality and extraversion occur, influencing the dialogue dynamics, and 3) the application of persona traits modifies the distribution of emotional support strategies, enhancing the relevance and empathetic quality of the responses. These findings highlight the potential of persona-driven LLMs in crafting more personalized, empathetic, and effective emotional support dialogues, which has significant implications for the future design of AI-driven emotional support systems. </p> </div> </dd> <dt> <a name='item116'>[116]</a> <a href ="/abs/2502.11454" title="Abstract" id="2502.11454"> arXiv:2502.11454 </a> [<a href="/pdf/2502.11454" title="Download PDF" id="pdf-2502.11454" aria-labelledby="pdf-2502.11454">pdf</a>, <a href="https://arxiv.org/html/2502.11454v1" title="View HTML" id="html-2502.11454" aria-labelledby="html-2502.11454" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11454" title="Other formats" id="oth-2502.11454" aria-labelledby="oth-2502.11454">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UniCBE: An Uniformity-driven Comparing Based Evaluation Framework with Unified Multi-Objective Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+P">Peiwen Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+S">Shaoxiong Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiwei Li</a>, 
<a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinglin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yueqi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+J">Jiayi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+C">Chuyi Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+B">Boyuan Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yao Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kan Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 spotlight </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Human preference plays a significant role in measuring large language models and guiding them to align with human values. Unfortunately, current comparing-based evaluation (CBE) methods typically focus on a single optimization objective, failing to effectively utilize scarce yet valuable preference signals. To address this, we delve into key factors that can enhance the accuracy, convergence, and scalability of CBE: suppressing sampling bias, balancing descending process of uncertainty, and mitigating updating uncertainty. Following the derived guidelines, we propose UniCBE, a unified uniformity-driven CBE framework which simultaneously optimize these core objectives by constructing and integrating three decoupled sampling probability matrices, each designed to ensure uniformity in specific aspects. We further ablate the optimal tuple sampling and preference aggregation strategies to achieve efficient CBE. On the AlpacaEval benchmark, UniCBE saves over 17% of evaluation budgets while achieving a Pearson correlation with ground truth exceeding 0.995, demonstrating excellent accuracy and convergence. 
In scenarios where new models are continuously introduced, UniCBE can even save over 50% of evaluation costs, highlighting its improved scalability. </p> </div> </dd> <dt> <a name='item117'>[117]</a> <a href ="/abs/2502.11457" title="Abstract" id="2502.11457"> arXiv:2502.11457 </a> [<a href="/pdf/2502.11457" title="Download PDF" id="pdf-2502.11457" aria-labelledby="pdf-2502.11457">pdf</a>, <a href="https://arxiv.org/html/2502.11457v1" title="View HTML" id="html-2502.11457" aria-labelledby="html-2502.11457" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11457" title="Other formats" id="oth-2502.11457" aria-labelledby="oth-2502.11457">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Aligning Sentence Simplification with ESL Learner's Proficiency for Language Acquisition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Guanlin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arase,+Y">Yuki Arase</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Crespi,+N">Noel Crespi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL2025 main </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Text simplification is crucial for improving accessibility and comprehension for English as a Second Language (ESL) learners. This study goes a step further and aims to facilitate ESL learners' language acquisition by simplification. Specifically, we propose simplifying complex sentences to appropriate levels for learners while also increasing vocabulary coverage of the target level in the simplifications. We achieve this without a parallel corpus by conducting reinforcement learning on a large language model. 
Our method employs token-level and sentence-level rewards, and iteratively trains the model on its self-generated outputs to guide the model to search for simplification hypotheses that satisfy the target attributes. Experiment results on CEFR-SP and TurkCorpus datasets show that the proposed method can effectively increase the frequency and diversity of vocabulary of the target level by more than $20\%$ compared to baseline models, while maintaining high simplification quality. </p> </div> </dd> <dt> <a name='item118'>[118]</a> <a href ="/abs/2502.11460" title="Abstract" id="2502.11460"> arXiv:2502.11460 </a> [<a href="/pdf/2502.11460" title="Download PDF" id="pdf-2502.11460" aria-labelledby="pdf-2502.11460">pdf</a>, <a href="/format/2502.11460" title="Other formats" id="oth-2502.11460" aria-labelledby="oth-2502.11460">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UnitCoder: Scalable Iterative Code Synthesis with Unit Test Guidance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yichuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+Y">Yunfan Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Peiji Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+D">Demin Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Q">Qipeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Linyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kai Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Software Engineering (cs.SE) 
</div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated remarkable capabilities in various tasks, yet code generation remains a major challenge. Current approaches for obtaining high-quality code data primarily focus on (i) collecting large-scale pre-training data and (ii) synthesizing instruction data through prompt engineering with powerful models. While pre-training data faces quality consistency issues, instruction-based synthesis suffers from limited instruction diversity and inherent biases of LLMs. To address this gap, we introduce UnitCoder, a systematic pipeline leveraging model-generated unit tests to both guide and validate the code generation process. Combined with large-scale package-based retrieval from pre-training corpus, we generate a dataset of 500K+ verifiable programs containing diverse API calls. Evaluations on multiple Python benchmarks (BigCodeBench, HumanEval, MBPP) demonstrate that models fine-tuned on our synthetic data exhibit consistent performance improvements. Notably, Llama3.1-8B and InternLM2.5-7B improve from 31\% and 28\% to 40\% and 39\% success rates on BigCodeBench, respectively. Our work presents a scalable approach that leverages model-generated unit tests to guide the synthesis of high-quality code data from pre-training corpora, demonstrating the potential for producing diverse and high-quality post-training data at scale. All code and data will be released (<a href="https://github.com" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). 
</p> </div> </dd> <dt> <a name='item119'>[119]</a> <a href ="/abs/2502.11469" title="Abstract" id="2502.11469"> arXiv:2502.11469 </a> [<a href="/pdf/2502.11469" title="Download PDF" id="pdf-2502.11469" aria-labelledby="pdf-2502.11469">pdf</a>, <a href="https://arxiv.org/html/2502.11469v1" title="View HTML" id="html-2502.11469" aria-labelledby="html-2502.11469" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11469" title="Other formats" id="oth-2502.11469" aria-labelledby="oth-2502.11469">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> If Attention Serves as a Cognitive Model of Human Memory Retrieval, What is the Plausible Memory Representation? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yoshida,+R">Ryo Yoshida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Isono,+S">Shinnosuke Isono</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kajikawa,+K">Kohei Kajikawa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Someya,+T">Taiga Someya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sugimito,+Y">Yushi Sugimito</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oseki,+Y">Yohei Oseki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent work in computational psycholinguistics has revealed intriguing parallels between attention mechanisms and human memory retrieval, focusing primarily on Transformer architectures that operate on token-level representations. 
However, computational psycholinguistic research has also established that syntactic structures provide compelling explanations for human sentence processing that word-level factors alone cannot fully account for. In this study, we investigate whether the attention mechanism of Transformer Grammar (TG), which uniquely operates on syntactic structures as representational units, can serve as a cognitive model of human memory retrieval, using Normalized Attention Entropy (NAE) as a linking hypothesis between model behavior and human processing difficulty. Our experiments demonstrate that TG's attention achieves superior predictive power for self-paced reading times compared to vanilla Transformer's, with further analyses revealing independent contributions from both models. These findings suggest that human sentence processing involves dual memory representations -- one based on syntactic structures and another on token sequences -- with attention serving as the general retrieval algorithm, while highlighting the importance of incorporating syntactic structures as representational units. 
</p> </div> </dd> <dt> <a name='item120'>[120]</a> <a href ="/abs/2502.11471" title="Abstract" id="2502.11471"> arXiv:2502.11471 </a> [<a href="/pdf/2502.11471" title="Download PDF" id="pdf-2502.11471" aria-labelledby="pdf-2502.11471">pdf</a>, <a href="https://arxiv.org/html/2502.11471v1" title="View HTML" id="html-2502.11471" aria-labelledby="html-2502.11471" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11471" title="Other formats" id="oth-2502.11471" aria-labelledby="oth-2502.11471">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GLTW: Joint Improved Graph Transformer and LLM via Three-Word Language for Knowledge Graph Completion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Y">Yuzhuo Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Cheng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Si,+S">Shuzheng Si</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yingli Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhitong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+C">Cunliang Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenhao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yufei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Ye Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+X">Xuantang Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+L">Lei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Knowledge Graph Completion (KGC), which aims to infer missing or incomplete facts, is a crucial task for KGs. However, integrating the vital structural information of KGs into Large Language Models (LLMs) and outputting predictions deterministically remains challenging. To address this, we propose a new method called GLTW, which encodes the structural information of KGs and merges it with LLMs to enhance KGC performance. Specifically, we introduce an improved Graph Transformer (iGT) that effectively encodes subgraphs with both local and global structural information and inherits the characteristics of the language model, bypassing training from scratch. Also, we develop a subgraph-based multi-classification training objective, using all entities within KG as classification objects, to boost learning efficiency. Importantly, we combine iGT with an LLM that takes KG language prompts as input. Our extensive experiments on various KG datasets show that GLTW achieves significant performance gains compared to SOTA baselines. 
</p> </div> </dd> <dt> <a name='item121'>[121]</a> <a href ="/abs/2502.11476" title="Abstract" id="2502.11476"> arXiv:2502.11476 </a> [<a href="/pdf/2502.11476" title="Download PDF" id="pdf-2502.11476" aria-labelledby="pdf-2502.11476">pdf</a>, <a href="https://arxiv.org/html/2502.11476v1" title="View HTML" id="html-2502.11476" aria-labelledby="html-2502.11476" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11476" title="Other formats" id="oth-2502.11476" aria-labelledby="oth-2502.11476">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FastMCTS: A Simple Sampling Strategy for Data Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Peiji Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+K">Kai Lv</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+Y">Yunfan Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yichuan Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Linyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+X">Xiaoqing Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Q">Qipeng Guo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Synthetic high-quality multi-step reasoning data can significantly enhance the performance of large language models on various tasks. However, most existing methods rely on rejection sampling, which generates trajectories independently and suffers from inefficiency and imbalanced sampling across problems of varying difficulty. 
In this work, we introduce FastMCTS, an innovative data synthesis strategy inspired by Monte Carlo Tree Search. FastMCTS provides a more efficient sampling method for multi-step reasoning data, offering step-level evaluation signals and promoting balanced sampling across problems of different difficulty levels. Experiments on both English and Chinese reasoning datasets demonstrate that FastMCTS generates over 30\% more correct reasoning paths compared to rejection sampling as the number of generated tokens scales up. Furthermore, under comparable synthetic data budgets, models trained on FastMCTS-generated data outperform those trained on rejection sampling data by 3.9\% across multiple benchmarks. As a lightweight sampling strategy, FastMCTS offers a practical and efficient alternative for synthesizing high-quality reasoning data. Our code will be released soon. </p> </div> </dd> <dt> <a name='item122'>[122]</a> <a href ="/abs/2502.11491" title="Abstract" id="2502.11491"> arXiv:2502.11491 </a> [<a href="/pdf/2502.11491" title="Download PDF" id="pdf-2502.11491" aria-labelledby="pdf-2502.11491">pdf</a>, <a href="https://arxiv.org/html/2502.11491v1" title="View HTML" id="html-2502.11491" aria-labelledby="html-2502.11491" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11491" title="Other formats" id="oth-2502.11491" aria-labelledby="oth-2502.11491">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Ontology-Guided Reverse Thinking Makes Large Language Models Stronger on Knowledge Graph Question Answering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+R">Runxuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+B">Bei Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiaqi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Baoxin Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Ming Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Dayong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shijin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable capabilities in natural language processing. However, in knowledge graph question answering tasks (KGQA), there remains the issue of answering questions that require multi-hop reasoning. Existing methods rely on entity vector matching, but the purpose of the question is abstract and difficult to match with specific entities. As a result, it is difficult to establish reasoning paths to the purpose, which leads to information loss and redundancy. To address this issue, inspired by human reverse thinking, we propose Ontology-Guided Reverse Thinking (ORT), a novel framework that constructs reasoning paths from purposes back to conditions. ORT operates in three key phases: (1) using LLM to extract purpose labels and condition labels, (2) constructing label reasoning paths based on the KG ontology, and (3) using the label reasoning paths to guide knowledge retrieval. Experiments on the WebQSP and CWQ datasets show that ORT achieves state-of-the-art performance and significantly enhances the capability of LLMs for KGQA. 
</p> </div> </dd> <dt> <a name='item123'>[123]</a> <a href ="/abs/2502.11493" title="Abstract" id="2502.11493"> arXiv:2502.11493 </a> [<a href="/pdf/2502.11493" title="Download PDF" id="pdf-2502.11493" aria-labelledby="pdf-2502.11493">pdf</a>, <a href="https://arxiv.org/html/2502.11493v1" title="View HTML" id="html-2502.11493" aria-labelledby="html-2502.11493" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11493" title="Other formats" id="oth-2502.11493" aria-labelledby="oth-2502.11493">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DAST: Context-Aware Compression in LLMs via Dynamic Allocation of Soft Tokens </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shaoshen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yangning Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zishan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+X">Xin Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shan,+Z">Zifei Shan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-tao Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) face computational inefficiencies and redundant processing when handling long context inputs, prompting a focus on compression techniques. While existing semantic vector-based compression methods achieve promising performance, these methods fail to account for the intrinsic information density variations between context chunks, instead allocating soft tokens uniformly across context chunks. 
This uniform distribution inevitably diminishes allocation to information-critical regions. To address this, we propose Dynamic Allocation of Soft Tokens (DAST), a simple yet effective method that leverages the LLM's intrinsic understanding of contextual relevance to guide compression. DAST combines perplexity-based local information with attention-driven global information to dynamically allocate soft tokens to the informative-rich chunks, enabling effective, context-aware compression. Experimental results across multiple benchmarks demonstrate that DAST surpasses state-of-the-art methods. </p> </div> </dd> <dt> <a name='item124'>[124]</a> <a href ="/abs/2502.11494" title="Abstract" id="2502.11494"> arXiv:2502.11494 </a> [<a href="/pdf/2502.11494" title="Download PDF" id="pdf-2502.11494" aria-labelledby="pdf-2502.11494">pdf</a>, <a href="https://arxiv.org/html/2502.11494v1" title="View HTML" id="html-2502.11494" aria-labelledby="html-2502.11494" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11494" title="Other formats" id="oth-2502.11494" aria-labelledby="oth-2502.11494">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Stop Looking for Important Tokens in Multimodal Language Models: Duplication Matters More </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zichen Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yifeng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shaobo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Junyuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qintong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Weijia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+C">Conghui He</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Linfeng Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Vision tokens in multimodal large language models often dominate huge computational overhead due to their excessive length compared to linguistic modality. Abundant recent methods aim to solve this problem with token pruning, which first defines an importance criterion for tokens and then prunes the unimportant vision tokens during inference. However, in this paper, we show that the importance is not an ideal indicator to decide whether a token should be pruned. Surprisingly, it usually results in inferior performance than random token pruning and leads to incompatibility with efficient attention computation operators. Instead, we propose DART (Duplication-Aware Reduction of Tokens), which prunes tokens based on their duplication with other tokens, leading to significant and training-free acceleration. Concretely, DART selects a small subset of pivot tokens and then retains the tokens with low duplication to the pivots, ensuring minimal information loss during token pruning. Experiments demonstrate that DART can prune 88.9% vision tokens while maintaining comparable performance, leading to a 1.99$\times$ and 2.99$\times$ speed-up in total time and prefilling stage, respectively, with good compatibility to efficient attention operators. Our codes are available at <a href="https://github.com/ZichenWen1/DART" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item125'>[125]</a> <a href ="/abs/2502.11495" title="Abstract" id="2502.11495"> arXiv:2502.11495 </a> [<a href="/pdf/2502.11495" title="Download PDF" id="pdf-2502.11495" aria-labelledby="pdf-2502.11495">pdf</a>, <a href="https://arxiv.org/html/2502.11495v1" title="View HTML" id="html-2502.11495" aria-labelledby="html-2502.11495" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11495" title="Other formats" id="oth-2502.11495" aria-labelledby="oth-2502.11495">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Balanced Multi-Factor In-Context Learning for Multilingual Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kaneko,+M">Masahiro Kaneko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baldwin,+T">Timothy Baldwin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual large language models (MLLMs) are able to leverage in-context learning (ICL) to achieve high performance by leveraging cross-lingual knowledge transfer without parameter updates. However, their effectiveness is highly sensitive to example selection, particularly in multilingual settings. Based on the findings of existing work, three key factors influence multilingual ICL: (1) semantic similarity, (2) linguistic alignment, and (3) language-specific performance. However, existing approaches address these factors independently, without explicitly disentangling their combined impact, leaving optimal example selection underexplored. 
To address this gap, we propose balanced multi-factor ICL (\textbf{BMF-ICL}), a method that quantifies and optimally balances these factors for improved example selection. Experiments on mCSQA and TYDI across four MLLMs demonstrate that BMF-ICL outperforms existing methods. Further analysis highlights the importance of incorporating all three factors and the importance of selecting examples from multiple languages. </p> </div> </dd> <dt> <a name='item126'>[126]</a> <a href ="/abs/2502.11501" title="Abstract" id="2502.11501"> arXiv:2502.11501 </a> [<a href="/pdf/2502.11501" title="Download PDF" id="pdf-2502.11501" aria-labelledby="pdf-2502.11501">pdf</a>, <a href="https://arxiv.org/html/2502.11501v1" title="View HTML" id="html-2502.11501" aria-labelledby="html-2502.11501" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11501" title="Other formats" id="oth-2502.11501" aria-labelledby="oth-2502.11501">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Token Pruning in Multimodal Large Language Models: Are We Solving the Right Problem? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zichen Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yifeng Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Weijia Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+C">Conghui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Linfeng Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Multimodal large language models (MLLMs) have shown remarkable performance for cross-modal understanding and generation, yet still suffer from severe inference costs. Recently, abundant works have been proposed to solve this problem with token pruning, which identifies the redundant tokens in MLLMs and then prunes them to reduce the computation and KV storage costs, leading to significant acceleration without training. While these methods claim efficiency gains, critical questions about their fundamental design and evaluation remain unanswered: Why do many existing approaches underperform even compared to naive random token selection? Are attention-based scoring sufficient for reliably identifying redundant tokens? Is language information really helpful during token pruning? What makes a good trade-off between token importance and duplication? Are current evaluation protocols comprehensive and unbiased? The ignorance of previous research on these problems hinders the long-term development of token pruning. In this paper, we answer these questions one by one, providing insights into the design of future token pruning methods. 
</p> </div> </dd> <dt> <a name='item127'>[127]</a> <a href ="/abs/2502.11508" title="Abstract" id="2502.11508"> arXiv:2502.11508 </a> [<a href="/pdf/2502.11508" title="Download PDF" id="pdf-2502.11508" aria-labelledby="pdf-2502.11508">pdf</a>, <a href="https://arxiv.org/html/2502.11508v1" title="View HTML" id="html-2502.11508" aria-labelledby="html-2502.11508" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11508" title="Other formats" id="oth-2502.11508" aria-labelledby="oth-2502.11508">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Chinese Spelling Correction: A Comprehensive Survey of Progress, Challenges, and Opportunities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Changchun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Kai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Junzhe Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+Z">Zixiao Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+E">Enhong Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Chinese Spelling Correction (CSC) is a critical task in natural language processing, aimed at detecting and correcting spelling errors in Chinese text. This survey provides a comprehensive overview of CSC, tracing its evolution from pre-trained language models to large language models, and critically analyzing their respective strengths and weaknesses in this domain. Moreover, we further present a detailed examination of existing benchmark datasets, highlighting their inherent challenges and limitations. 
Finally, we propose promising future research directions, particularly focusing on leveraging the potential of LLMs and their reasoning capabilities for improved CSC performance. To the best of our knowledge, this is the first comprehensive survey dedicated to the field of CSC. We believe this work will serve as a valuable resource for researchers, fostering a deeper understanding of the field and inspiring future advancements. </p> </div> </dd> <dt> <a name='item128'>[128]</a> <a href ="/abs/2502.11514" title="Abstract" id="2502.11514"> arXiv:2502.11514 </a> [<a href="/pdf/2502.11514" title="Download PDF" id="pdf-2502.11514" aria-labelledby="pdf-2502.11514">pdf</a>, <a href="https://arxiv.org/html/2502.11514v1" title="View HTML" id="html-2502.11514" aria-labelledby="html-2502.11514" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11514" title="Other formats" id="oth-2502.11514" aria-labelledby="oth-2502.11514">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Investigating Inference-time Scaling for Chain of Multi-modal Thought: A Preliminary Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yujie Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+A">Ante Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Moye Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jingyao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jinsong Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+X">Xinyan Xiao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recently, inference-time scaling of 
chain-of-thought (CoT) has been demonstrated as a promising approach for addressing multi-modal reasoning tasks. While existing studies have predominantly centered on text-based thinking, the integration of both visual and textual modalities within the reasoning process remains unexplored. In this study, we pioneer the exploration of inference-time scaling with multi-modal thought, aiming to bridge this gap. To provide a comprehensive analysis, we systematically investigate popular sampling-based and tree search-based inference-time scaling methods on 10 challenging tasks spanning various domains. Besides, we uniformly adopt a consistency-enhanced verifier to ensure effective guidance for both methods across different thought paradigms. Results show that multi-modal thought promotes better performance against conventional text-only thought, and blending the two types of thought fosters more diverse thinking. Despite these advantages, multi-modal thoughts necessitate higher token consumption for processing richer visual inputs, which raises concerns in practical applications. We hope that our findings on the merits and drawbacks of this research line will inspire future works in the field. 
</p> </div> </dd> <dt> <a name='item129'>[129]</a> <a href ="/abs/2502.11517" title="Abstract" id="2502.11517"> arXiv:2502.11517 </a> [<a href="/pdf/2502.11517" title="Download PDF" id="pdf-2502.11517" aria-labelledby="pdf-2502.11517">pdf</a>, <a href="https://arxiv.org/html/2502.11517v1" title="View HTML" id="html-2502.11517" aria-labelledby="html-2502.11517" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11517" title="Other formats" id="oth-2502.11517" aria-labelledby="oth-2502.11517">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Keep a Promise: Scaling Language Model Decoding Parallelism with Learned Asynchronous Decoding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+T">Tian Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+E+Y">Ellie Y. Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ankner,+Z">Zack Ankner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saunshi,+N">Nikunj Saunshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Elias,+B+M">Blake M. 
Elias</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yazdanbakhsh,+A">Amir Yazdanbakhsh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ragan-Kelley,+J">Jonathan Ragan-Kelley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Subramanian,+S">Suvinay Subramanian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carbin,+M">Michael Carbin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Distributed, Parallel, and Cluster Computing (cs.DC); Machine Learning (cs.LG) </div> <p class='mathjax'> Decoding with autoregressive large language models (LLMs) traditionally occurs sequentially, generating one token after another. An emerging line of work explored parallel decoding by identifying and simultaneously generating semantically independent chunks of LLM responses. However, these techniques rely on hand-crafted heuristics tied to syntactic structures like lists and paragraphs, making them rigid and imprecise. We present PASTA, a learning-based system that teaches LLMs to identify semantic independence and express parallel decoding opportunities in their own responses. At its core are PASTA-LANG and its interpreter: PASTA-LANG is an annotation language that enables LLMs to express semantic independence in their own responses; the language interpreter acts on these annotations to orchestrate parallel decoding on-the-fly at inference time. Through a two-stage finetuning process, we train LLMs to generate PASTA-LANG annotations that optimize both response quality and decoding speed. 
Evaluation on AlpacaEval, an instruction following benchmark, shows that our approach Pareto-dominates existing methods in terms of decoding speed and response quality; our results demonstrate geometric mean speedups ranging from 1.21x to 1.93x with corresponding quality changes of +2.2% to -7.1%, measured by length-controlled win rates against sequential decoding baseline. </p> </div> </dd> <dt> <a name='item130'>[130]</a> <a href ="/abs/2502.11520" title="Abstract" id="2502.11520"> arXiv:2502.11520 </a> [<a href="/pdf/2502.11520" title="Download PDF" id="pdf-2502.11520" aria-labelledby="pdf-2502.11520">pdf</a>, <a href="/format/2502.11520" title="Other formats" id="oth-2502.11520" aria-labelledby="oth-2502.11520">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AURORA:Automated Training Framework of Universal Process Reward Models via Ensemble Prompting and Reverse Verification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+X">Xiaoyu Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+T">Tianchu Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+C">Chao Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Minghao Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+D">Dakuan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haozhe Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xihe Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+W">Wei Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yinghui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+Y">Yuan Qi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under 
Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The reasoning capabilities of advanced large language models (LLMs) like o1 have revolutionized artificial intelligence applications. Nevertheless, evaluating and optimizing complex reasoning processes remain significant challenges due to diverse policy distributions and the inherent limitations of human effort and accuracy. In this paper, we present AURORA, a novel automated framework for training universal process reward models (PRMs) using ensemble prompting and reverse verification. The framework employs a two-phase approach: First, it uses diverse prompting strategies and ensemble methods to perform automated annotation and evaluation of processes, ensuring robust assessments for reward learning. Second, it leverages practical reference answers for reverse verification, enhancing the model's ability to validate outputs and improving training accuracy. To assess the framework's performance, we extend beyond the existing ProcessBench benchmark by introducing UniversalBench, which evaluates reward predictions across full trajectories under diverse policy distributions with long Chain-of-Thought (CoT) outputs. Experimental results demonstrate that AURORA enhances process evaluation accuracy, improves PRMs' accuracy for diverse policy distributions and long-CoT responses. The project will be open-sourced at <a href="https://auroraprm.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. The Universal-PRM-7B is available at <a href="https://huggingface.co/infly/Universal-PRM-7B" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item131'>[131]</a> <a href ="/abs/2502.11525" title="Abstract" id="2502.11525"> arXiv:2502.11525 </a> [<a href="/pdf/2502.11525" title="Download PDF" id="pdf-2502.11525" aria-labelledby="pdf-2502.11525">pdf</a>, <a href="https://arxiv.org/html/2502.11525v1" title="View HTML" id="html-2502.11525" aria-labelledby="html-2502.11525" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11525" title="Other formats" id="oth-2502.11525" aria-labelledby="oth-2502.11525">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Training Large Language Models to be Better Rule Followers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yi Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+S">Shijia Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Haotong Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haotian Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Muhan Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have shown impressive performance across a wide range of tasks. However, they often exhibit unexpected failures in seemingly straightforward tasks, suggesting a reliance on case-based reasoning rather than rule-based reasoning. While the vast training corpus of LLMs contains numerous textual "rules", current training methods fail to leverage these rules effectively. Crucially, the relationships between these "rules" and their corresponding "instances" are not explicitly modeled. As a result, while LLMs can often recall rules with ease, they fail to apply these rules strictly and consistently in relevant reasoning scenarios. 
In this paper, we investigate the rule-following capabilities of LLMs and propose Meta Rule-Following Fine-Tuning (Meta-RFFT) to enhance the cross-task transferability of rule-following abilities. We first construct a dataset of 88 tasks requiring following rules, encompassing diverse reasoning domains. We demonstrate through extensive experiments that models trained on large-scale rule-following tasks are better rule followers, outperforming the baselines in both downstream fine-tuning and few-shot prompting scenarios. This highlights the cross-task transferability of models with the aid of Meta-RFFT. Furthermore, we examine the influence of factors such as dataset size, rule formulation, and in-context learning. </p> </div> </dd> <dt> <a name='item132'>[132]</a> <a href ="/abs/2502.11533" title="Abstract" id="2502.11533"> arXiv:2502.11533 </a> [<a href="/pdf/2502.11533" title="Download PDF" id="pdf-2502.11533" aria-labelledby="pdf-2502.11533">pdf</a>, <a href="https://arxiv.org/html/2502.11533v1" title="View HTML" id="html-2502.11533" aria-labelledby="html-2502.11533" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11533" title="Other formats" id="oth-2502.11533" aria-labelledby="oth-2502.11533">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Be Cautious When Merging Unfamiliar LLMs: A Phishing Model Capable of Stealing Privacy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhenyuan Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+W">Wenlong Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+C">Chen Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+C">Chengkun Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Wenzhi Chen</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Model merging is a widespread technology in large language models (LLMs) that integrates multiple task-specific LLMs into a unified one, enabling the merged model to inherit the specialized capabilities of these LLMs. Most task-specific LLMs are sourced from open-source communities and have not undergone rigorous auditing, potentially imposing risks in model merging. This paper highlights an overlooked privacy risk: \textit{an unsafe model could compromise the privacy of other LLMs involved in the model merging.} Specifically, we propose PhiMM, a privacy attack approach that trains a phishing model capable of stealing privacy using a crafted privacy phishing instruction dataset. Furthermore, we introduce a novel model cloaking method that mimics a specialized capability to conceal attack intent, luring users into merging the phishing model. Once victims merge the phishing model, the attacker can extract personally identifiable information (PII) or infer membership information (MI) by querying the merged model with the phishing instruction. Experimental results show that merging a phishing model increases the risk of privacy breaches. Compared to the results before merging, PII leakage increased by 3.9\% and MI leakage increased by 17.4\% on average. We release the code of PhiMM through a link. 
</p> </div> </dd> <dt> <a name='item133'>[133]</a> <a href ="/abs/2502.11541" title="Abstract" id="2502.11541"> arXiv:2502.11541 </a> [<a href="/pdf/2502.11541" title="Download PDF" id="pdf-2502.11541" aria-labelledby="pdf-2502.11541">pdf</a>, <a href="https://arxiv.org/html/2502.11541v1" title="View HTML" id="html-2502.11541" aria-labelledby="html-2502.11541" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11541" title="Other formats" id="oth-2502.11541" aria-labelledby="oth-2502.11541">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MuSC: Improving Complex Instruction Following with Multi-granularity Self-Contrastive Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Hui Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yancheng He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shilong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Bing Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+C">Conghui Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Muyun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tiejun Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Complex instruction-following with elaborate constraints is imperative for Large Language Models (LLMs). While existing methods have constructed data for complex instruction alignment, they all rely on a more advanced model, especially GPT-4, limiting their application. 
In this paper, we propose a Multi-granularity Self-Contrastive Training (MuSC) framework, to improve the complex instruction alignment without relying on a stronger model. Our method is conducted on both coarse and fine granularity. On coarse-granularity, we construct constraint-aware preference data based on instruction decomposition and recombination. On fine-granularity, we perform token-aware preference optimization with dynamic token-level supervision. Our method is evaluated on open-sourced models, and experiment results show our method achieves significant improvement on both complex and general instruction-following benchmarks, surpassing previous self-alignment methods. </p> </div> </dd> <dt> <a name='item134'>[134]</a> <a href ="/abs/2502.11544" title="Abstract" id="2502.11544"> arXiv:2502.11544 </a> [<a href="/pdf/2502.11544" title="Download PDF" id="pdf-2502.11544" aria-labelledby="pdf-2502.11544">pdf</a>, <a href="https://arxiv.org/html/2502.11544v1" title="View HTML" id="html-2502.11544" aria-labelledby="html-2502.11544" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11544" title="Other formats" id="oth-2502.11544" aria-labelledby="oth-2502.11544">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating o1-Like LLMs: Unlocking Reasoning for Translation through Comprehensive Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+A">Andong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuchen Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+W">Wenxin Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Muyun Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+T">Tiejun Zhao</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=zhang,+M">Min zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The o1-Like LLMs are transforming AI by simulating human cognitive processes, but their performance in multilingual machine translation (MMT) remains underexplored. This study examines: (1) how o1-Like LLMs perform in MMT tasks and (2) what factors influence their translation quality. We evaluate multiple o1-Like LLMs and compare them with traditional models like ChatGPT and GPT-4o. Results show that o1-Like LLMs establish new multilingual translation benchmarks, with DeepSeek-R1 surpassing GPT-4o in contextless tasks. They demonstrate strengths in historical and cultural translation but exhibit a tendency for rambling issues in Chinese-centric outputs. Further analysis reveals three key insights: (1) High inference costs and slower processing speeds make complex translation tasks more resource-intensive. (2) Translation quality improves with model size, enhancing commonsense reasoning and cultural translation. (3) The temperature parameter significantly impacts output quality-lower temperatures yield more stable and accurate translations, while higher temperatures reduce coherence and precision. 
</p> </div> </dd> <dt> <a name='item135'>[135]</a> <a href ="/abs/2502.11546" title="Abstract" id="2502.11546"> arXiv:2502.11546 </a> [<a href="/pdf/2502.11546" title="Download PDF" id="pdf-2502.11546" aria-labelledby="pdf-2502.11546">pdf</a>, <a href="https://arxiv.org/html/2502.11546v1" title="View HTML" id="html-2502.11546" aria-labelledby="html-2502.11546" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11546" title="Other formats" id="oth-2502.11546" aria-labelledby="oth-2502.11546">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DCAD-2000: A Multilingual Dataset across 2000+ Languages with Data Cleaning as Anomaly Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yingli Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lai,+W">Wen Lai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xueren Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fraser,+A">Alexander Fraser</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid development of multilingual large language models (LLMs) highlights the need for high-quality, diverse, and clean multilingual datasets. In this paper, we introduce DCAD-2000 (Data Cleaning as Anomaly Detection), a large-scale multilingual corpus built using newly extracted Common Crawl data and existing multilingual datasets. 
DCAD-2000 includes over 2,282 languages, 46.72TB of data, and 8.63 billion documents, spanning 155 high- and medium-resource languages and 159 writing scripts. To overcome the limitations of current data cleaning methods, which rely on manual heuristic thresholds, we propose reframing data cleaning as an anomaly detection task. This dynamic filtering approach significantly enhances data quality by identifying and removing noisy or anomalous content. We evaluate the quality of DCAD-2000 on the FineTask benchmark, demonstrating substantial improvements in multilingual dataset quality and task performance. </p> </div> </dd> <dt> <a name='item136'>[136]</a> <a href ="/abs/2502.11559" title="Abstract" id="2502.11559"> arXiv:2502.11559 </a> [<a href="/pdf/2502.11559" title="Download PDF" id="pdf-2502.11559" aria-labelledby="pdf-2502.11559">pdf</a>, <a href="https://arxiv.org/html/2502.11559v1" title="View HTML" id="html-2502.11559" aria-labelledby="html-2502.11559" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11559" title="Other formats" id="oth-2502.11559" aria-labelledby="oth-2502.11559">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Auto-Search and Refinement: An Automated Framework for Gender Bias Mitigation in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yue Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+C">Chengyan Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+L">Li Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Sibei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p 
class='mathjax'> Pre-training large language models (LLMs) on vast text corpora enhances natural language processing capabilities but risks encoding social biases, particularly gender bias. While parameter-modification methods like fine-tuning mitigate bias, they are resource-intensive, unsuitable for closed-source models, and lack adaptability to evolving societal norms. Instruction-based approaches offer flexibility but often compromise task performance. To address these limitations, we propose $\textit{FaIRMaker}$, an automated and model-independent framework that employs an $\textbf{auto-search and refinement}$ paradigm to adaptively generate Fairwords, which act as instructions integrated into input queries to reduce gender bias and enhance response quality. Extensive experiments demonstrate that $\textit{FaIRMaker}$ automatically searches for and dynamically refines Fairwords, effectively mitigating gender bias while preserving task integrity and ensuring compatibility with both API-based and open-source LLMs. 
</p> </div> </dd> <dt> <a name='item137'>[137]</a> <a href ="/abs/2502.11562" title="Abstract" id="2502.11562"> arXiv:2502.11562 </a> [<a href="/pdf/2502.11562" title="Download PDF" id="pdf-2502.11562" aria-labelledby="pdf-2502.11562">pdf</a>, <a href="https://arxiv.org/html/2502.11562v1" title="View HTML" id="html-2502.11562" aria-labelledby="html-2502.11562" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11562" title="Other formats" id="oth-2502.11562" aria-labelledby="oth-2502.11562">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reinforced Information Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chaofan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jianlyv Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lian,+D">Defu Lian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+Y">Yingxia Shao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While retrieval techniques are widely used in practice, they still face significant challenges in cross-domain scenarios. Recently, generation-augmented methods have emerged as a promising solution to this problem. These methods enhance raw queries by incorporating additional information from an LLM-based generator, facilitating more direct retrieval of relevant documents. However, existing methods struggle with highly specialized situations that require extensive domain expertise. To address this problem, we present \textbf{Reinforced-IR}, a novel approach that jointly adapts a pre-trained retriever and generator for precise cross-domain retrieval. 
A key innovation of Reinforced-IR is its \textbf{Self-Boosting} framework, which enables retriever and generator to learn from each other's feedback. Specifically, the generator is reinforced to generate query augmentations that enhance the retriever's performance, while the retriever is trained to better discriminate the relevant documents identified by the generator. This iterative process allows the end-to-end retrieval performance to be progressively optimized using an unlabeled corpus from the target domain. In our experiment, Reinforced-IR outperforms existing domain adaptation methods by a large margin, leading to substantial improvements in retrieval quality across a wide range of application scenarios. </p> </div> </dd> <dt> <a name='item138'>[138]</a> <a href ="/abs/2502.11569" title="Abstract" id="2502.11569"> arXiv:2502.11569 </a> [<a href="/pdf/2502.11569" title="Download PDF" id="pdf-2502.11569" aria-labelledby="pdf-2502.11569">pdf</a>, <a href="/format/2502.11569" title="Other formats" id="oth-2502.11569" aria-labelledby="oth-2502.11569">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Reasoning Ability of Small Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gaurav">Gaurav Srivastava</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+S">Shuxiang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xuan Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Reasoning has long been viewed as an emergent property of large language models (LLMs), appearing at or above a certain scale ($\sim$100B parameters). 
However, recent studies challenge this assumption, showing that small language models (SLMs) can also achieve competitive reasoning performance. SLMs are increasingly favored for their efficiency and deployability. However, there is a lack of systematic study on the reasoning abilities of diverse SLMs, including those trained from scratch or derived from LLMs through quantization, pruning, and distillation. This raises a critical question: Can SLMs achieve reasoning abilities comparable to LLMs? In this work, we systematically survey, benchmark, and analyze 72 SLMs from six model families across 14 reasoning benchmarks. For reliable evaluation, we examine four evaluation methods and compare four LLM judges against human evaluations on 800 data points. We repeat all experiments three times to ensure a robust performance assessment. Additionally, we analyze the impact of different prompting strategies in small models. Beyond accuracy, we also evaluate model robustness under adversarial conditions and intermediate reasoning steps. Our findings challenge the assumption that scaling is the only way to achieve strong reasoning. Instead, we foresee a future where SLMs with strong reasoning capabilities can be developed through structured training or post-training compression. They can serve as efficient alternatives to LLMs for reasoning-intensive tasks. 
</p> </div> </dd> <dt> <a name='item139'>[139]</a> <a href ="/abs/2502.11571" title="Abstract" id="2502.11571"> arXiv:2502.11571 </a> [<a href="/pdf/2502.11571" title="Download PDF" id="pdf-2502.11571" aria-labelledby="pdf-2502.11571">pdf</a>, <a href="https://arxiv.org/html/2502.11571v1" title="View HTML" id="html-2502.11571" aria-labelledby="html-2502.11571" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11571" title="Other formats" id="oth-2502.11571" aria-labelledby="oth-2502.11571">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FaMTEB: Massive Text Embedding Benchmark in Persian Language </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zinvandi,+E">Erfan Zinvandi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alikhani,+M">Morteza Alikhani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sarmadi,+M">Mehran Sarmadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pourbahman,+Z">Zahra Pourbahman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arvin,+S">Sepehr Arvin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kazemi,+R">Reza Kazemi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Amini,+A">Arash Amini</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> to appear in ACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> In this paper, we introduce a comprehensive benchmark for Persian (Farsi) text embeddings, built upon the Massive Text Embedding Benchmark (MTEB). 
Our benchmark includes 63 datasets spanning seven different tasks: classification, clustering, pair classification, reranking, retrieval, summary retrieval, and semantic textual similarity. The datasets are formed as a combination of existing, translated, and newly generated data, offering a diverse evaluation framework for Persian language models. Given the increasing use of text embedding models in chatbots, evaluation datasets are becoming inseparable ingredients in chatbot challenges and Retrieval-Augmented Generation systems. As a contribution, we include chatbot evaluation datasets in the MTEB benchmark for the first time. In addition, in this paper, we introduce the new task of summary retrieval which is not part of the tasks included in standard MTEB. Another contribution of this paper is the introduction of a substantial number of new Persian language NLP datasets suitable for training and evaluation, some of which have no previous counterparts in Persian. We evaluate the performance of several Persian and multilingual embedding models in a range of tasks. This work introduces an open-source benchmark with datasets, code and a public leaderboard. 
</p> </div> </dd> <dt> <a name='item140'>[140]</a> <a href ="/abs/2502.11573" title="Abstract" id="2502.11573"> arXiv:2502.11573 </a> [<a href="/pdf/2502.11573" title="Download PDF" id="pdf-2502.11573" aria-labelledby="pdf-2502.11573">pdf</a>, <a href="https://arxiv.org/html/2502.11573v1" title="View HTML" id="html-2502.11573" aria-labelledby="html-2502.11573" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11573" title="Other formats" id="oth-2502.11573" aria-labelledby="oth-2502.11573">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InfiR : Crafting Effective Small Language Models and Multimodal Small Language Models in Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+C">Congkai Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+S">Shuo Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Pengxiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sang,+Z">Zhijie Sang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+K">Kejing Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yiming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+G">Guanghao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zeyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuhang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+S">Su Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Baoyi He</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Q">Qi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xiaotian Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jianbo Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shengyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+F">Fei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) have made significant advancements in reasoning capabilities. However, they still face challenges such as high computational demands and privacy concerns. This paper focuses on developing efficient Small Language Models (SLMs) and Multimodal Small Language Models (MSLMs) that retain competitive reasoning abilities. We introduce a novel training pipeline that enhances reasoning capabilities and facilitates deployment on edge devices, achieving state-of-the-art performance while minimizing development costs. InfiR aims to advance AI systems by improving reasoning, reducing adoption barriers, and addressing privacy concerns through smaller model sizes. Resources are available at https://github.com/Reallm-Labs/InfiR. 
</p> </div> </dd> <dt> <a name='item141'>[141]</a> <a href ="/abs/2502.11578" title="Abstract" id="2502.11578"> arXiv:2502.11578 </a> [<a href="/pdf/2502.11578" title="Download PDF" id="pdf-2502.11578" aria-labelledby="pdf-2502.11578">pdf</a>, <a href="https://arxiv.org/html/2502.11578v1" title="View HTML" id="html-2502.11578" aria-labelledby="html-2502.11578" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11578" title="Other formats" id="oth-2502.11578" aria-labelledby="oth-2502.11578">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Complexity Measurement as a Noisy Zero-Shot Proxy for Evaluating LLM Performance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Moell,+B">Birger Moell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Boye,+J">Johan Boye</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have made significant strides in natural language generation but often face challenges in tasks requiring precise calculations and structural analysis. This paper investigates the performance of state-of-the-art LLMs on language complexity measurement tasks, through the computation of the LIX readability metric and Average Dependency Distance (ADD). Using Swedish high school and university-level essays, we evaluate the models' abilities to compute LIX scores and perform dependency parsing, comparing their results to established ground truths. 
Our findings reveal that while all models demonstrate some capacity for these tasks, ChatGPT-o1-mini performs most consistently, achieving the highest accuracy in both LIX computation and dependency parsing. Additionally, we observe a strong, significant correlation (r = -0.875, p = 0.026, N = 6) between the models' accuracy in computing LIX and their overall performance on the Massive Multitask Language Understanding (MMLU) benchmark. These results suggest that language complexity measurement abilities can serve as noisy zero-shot proxies for assessing the general capabilities of LLMs, providing a practical method for model evaluation without the need for extensive benchmarking datasets. </p> </div> </dd> <dt> <a name='item142'>[142]</a> <a href ="/abs/2502.11598" title="Abstract" id="2502.11598"> arXiv:2502.11598 </a> [<a href="/pdf/2502.11598" title="Download PDF" id="pdf-2502.11598" aria-labelledby="pdf-2502.11598">pdf</a>, <a href="https://arxiv.org/html/2502.11598v1" title="View HTML" id="html-2502.11598" aria-labelledby="html-2502.11598" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11598" title="Other formats" id="oth-2502.11598" aria-labelledby="oth-2502.11598">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can LLM Watermarks Robustly Prevent Unauthorized Knowledge Distillation? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+L">Leyi Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aiwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Shiyu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yijian Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xuming Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+L">Lijie Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=King,+I">Irwin King</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+P+S">Philip S. Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 12 figures, 13 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The radioactive nature of Large Language Model (LLM) watermarking enables the detection of watermarks inherited by student models when trained on the outputs of watermarked teacher models, making it a promising tool for preventing unauthorized knowledge distillation. However, the robustness of watermark radioactivity against adversarial actors remains largely unexplored. In this paper, we investigate whether student models can acquire the capabilities of teacher models through knowledge distillation while avoiding watermark inheritance. We propose two categories of watermark removal approaches: pre-distillation removal through untargeted and targeted training data paraphrasing (UP and TP), and post-distillation removal through inference-time watermark neutralization (WN). 
Extensive experiments across multiple model pairs, watermarking schemes and hyper-parameter settings demonstrate that both TP and WN thoroughly eliminate inherited watermarks, with WN achieving this while maintaining knowledge transfer efficiency and low computational overhead. Given the ongoing deployment of watermarking techniques in production LLMs, these findings emphasize the urgent need for more robust defense strategies. Our code is available at <a href="https://github.com/THU-BPM/Watermark-Radioactivity-Attack" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item143'>[143]</a> <a href ="/abs/2502.11603" title="Abstract" id="2502.11603"> arXiv:2502.11603 </a> [<a href="/pdf/2502.11603" title="Download PDF" id="pdf-2502.11603" aria-labelledby="pdf-2502.11603">pdf</a>, <a href="https://arxiv.org/html/2502.11603v1" title="View HTML" id="html-2502.11603" aria-labelledby="html-2502.11603" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11603" title="Other formats" id="oth-2502.11603" aria-labelledby="oth-2502.11603">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DR.GAP: Mitigating Bias in Large Language Models using Gender-Aware Prompting with Demonstration and Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+H">Hongye Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yue Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+M">Meikang Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) exhibit strong natural language processing 
capabilities but also inherit and amplify societal biases, including gender bias, raising fairness concerns. Existing debiasing methods face significant limitations: parameter tuning requires access to model weights, prompt-based approaches often degrade model utility, and optimization-based techniques lack generalizability. To address these challenges, we propose <a href="http://DR.GAP" rel="external noopener nofollow" class="link-external link-http">this http URL</a> (Demonstration and Reasoning for Gender-Aware Prompting), an automated and model-agnostic approach that mitigates gender bias while preserving model performance. <a href="http://DR.GAP" rel="external noopener nofollow" class="link-external link-http">this http URL</a> selects bias-revealing examples and generates structured reasoning to guide models toward more impartial responses. Extensive experiments on coreference resolution and QA tasks across multiple LLMs (GPT-3.5, Llama3, and Llama2-Alpaca) demonstrate its effectiveness, generalization ability, and robustness. <a href="http://DR.GAP" rel="external noopener nofollow" class="link-external link-http">this http URL</a> can generalize to vision-language models (VLMs), achieving significant bias reduction. 
</p> </div> </dd> <dt> <a name='item144'>[144]</a> <a href ="/abs/2502.11611" title="Abstract" id="2502.11611"> arXiv:2502.11611 </a> [<a href="/pdf/2502.11611" title="Download PDF" id="pdf-2502.11611" aria-labelledby="pdf-2502.11611">pdf</a>, <a href="https://arxiv.org/html/2502.11611v1" title="View HTML" id="html-2502.11611" aria-labelledby="html-2502.11611" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11611" title="Other formats" id="oth-2502.11611" aria-labelledby="oth-2502.11611">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Identifying Gender Stereotypes and Biases in Automated Translation from English to Italian using Similarity Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammadi,+F">Fatemeh Mohammadi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tamborini,+M+A">Marta Annamaria Tamborini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ceravolo,+P">Paolo Ceravolo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nardocci,+C">Costanza Nardocci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Maghool,+S">Samira Maghool</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper is a collaborative effort between Linguistics, Law, and Computer Science to evaluate stereotypes and biases in automated translation systems. We advocate gender-neutral translation as a means to promote gender inclusion and improve the objectivity of machine translation. Our approach focuses on identifying gender bias in English-to-Italian translations. First, we define gender bias following human rights law and linguistics literature. 
Then we proceed by identifying gender-specific terms such as she/lei and he/lui as key elements. We then evaluate the cosine similarity between these target terms and others in the dataset to reveal the model's perception of semantic relations. Using numerical features, we effectively evaluate the intensity and direction of the bias. Our findings provide tangible insights for developing and training gender-neutral translation algorithms. </p> </div> </dd> <dt> <a name='item145'>[145]</a> <a href ="/abs/2502.11614" title="Abstract" id="2502.11614"> arXiv:2502.11614 </a> [<a href="/pdf/2502.11614" title="Download PDF" id="pdf-2502.11614" aria-labelledby="pdf-2502.11614">pdf</a>, <a href="https://arxiv.org/html/2502.11614v1" title="View HTML" id="html-2502.11614" aria-labelledby="html-2502.11614" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11614" title="Other formats" id="oth-2502.11614" aria-labelledby="oth-2502.11614">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Is Human-Like Text Liked by Humans? 
Multilingual Human Detection and Preference Against AI </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuxia Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+R">Rui Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mansurov,+J">Jonibek Mansurov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Puccetti,+G">Giovanni Puccetti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhuohan Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ta,+M+N">Minh Ngoc Ta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Geng,+J">Jiahui Geng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jinyan Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abassy,+M">Mervat Abassy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmed,+S+E+D">Saad El Dine Ahmed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Elozeiri,+K">Kareem Elozeiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laiyk,+N">Nurkhan Laiyk</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goloburda,+M">Maiya Goloburda</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahmoud,+T">Tarek Mahmoud</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tomar,+R+V">Raj Vardhan Tomar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aziz,+A">Alexander Aziz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koike,+R">Ryuto Koike</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaneko,+M">Masahiro Kaneko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shelmanov,+A">Artem Shelmanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Artemova,+E">Ekaterina Artemova</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Mikhailov,+V">Vladislav Mikhailov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsvigun,+A">Akim Tsvigun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Habash,+N">Nizar Habash</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gurevych,+I">Iryna Gurevych</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nakov,+P">Preslav Nakov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Prior studies have shown that distinguishing text generated by large language models (LLMs) from human-written one is highly challenging, and often no better than random guessing. To verify the generalizability of this finding across languages and domains, we perform an extensive case study to identify the upper bound of human detection accuracy. Across 16 datasets covering 9 languages and 9 domains, 19 annotators achieved an average detection accuracy of 87.6%, thus challenging previous conclusions. We find that major gaps between human and machine text lie in concreteness, cultural nuances, and diversity. Prompting by explicitly explaining the distinctions in the prompts can partially bridge the gaps in over 50% of the cases. However, we also find that humans do not always prefer human-written text, particularly when they cannot clearly identify its source. 
</p> </div> </dd> <dt> <a name='item146'>[146]</a> <a href ="/abs/2502.11633" title="Abstract" id="2502.11633"> arXiv:2502.11633 </a> [<a href="/pdf/2502.11633" title="Download PDF" id="pdf-2502.11633" aria-labelledby="pdf-2502.11633">pdf</a>, <a href="https://arxiv.org/html/2502.11633v1" title="View HTML" id="html-2502.11633" aria-labelledby="html-2502.11633" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11633" title="Other formats" id="oth-2502.11633" aria-labelledby="oth-2502.11633">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLASS: Enhancing Cross-Modal Text-Molecule Retrieval Performance and Training Efficiency </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Hongyan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+P">Peijian Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+W">Weixiong Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Lianxi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+N">Nankai Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shengyi Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+A">Aimin Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Cross-modal text-molecule retrieval task bridges molecule structures and natural language descriptions. Existing methods predominantly focus on aligning text modality and molecule modality, yet they overlook adaptively adjusting the learning states at different training stages and enhancing training efficiency. 
To tackle these challenges, this paper proposes a Curriculum Learning-bAsed croSS-modal text-molecule training framework (CLASS), which can be integrated with any backbone to yield promising performance improvement. Specifically, we quantify the sample difficulty considering both text modality and molecule modality, and design a sample scheduler to introduce training samples via an easy-to-difficult paradigm as the training advances, remarkably reducing the scale of training samples at the early stage of training and improving training efficiency. Moreover, we introduce adaptive intensity learning to increase the training intensity as the training progresses, which adaptively controls the learning intensity across all curriculum stages. Experimental results on the ChEBI-20 dataset demonstrate that our proposed method gains superior performance, simultaneously achieving prominent time savings. </p> </div> </dd> <dt> <a name='item147'>[147]</a> <a href ="/abs/2502.11656" title="Abstract" id="2502.11656"> arXiv:2502.11656 </a> [<a href="/pdf/2502.11656" title="Download PDF" id="pdf-2502.11656" aria-labelledby="pdf-2502.11656">pdf</a>, <a href="https://arxiv.org/html/2502.11656v1" title="View HTML" id="html-2502.11656" aria-labelledby="html-2502.11656" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11656" title="Other formats" id="oth-2502.11656" aria-labelledby="oth-2502.11656">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncovering the Impact of Chain-of-Thought Reasoning for Direct Preference Optimization: Lessons from Text-to-SQL </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hanbing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haoyang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaokang Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Ruotong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Haiyong Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+T">Tian Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+Q">Qi Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jing Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Databases (cs.DB) </div> <p class='mathjax'> Direct Preference Optimization (DPO) has proven effective in complex reasoning tasks like math word problems and code generation. However, when applied to Text-to-SQL datasets, it often fails to improve performance and can even degrade it. Our investigation reveals the root cause: unlike math and code tasks, which naturally integrate Chain-of-Thought (CoT) reasoning with DPO, Text-to-SQL datasets typically include only final answers (gold SQL queries) without detailed CoT solutions. By augmenting Text-to-SQL datasets with synthetic CoT solutions, we achieve, for the first time, consistent and significant performance improvements using DPO. Our analysis shows that CoT reasoning is crucial for unlocking DPO's potential, as it mitigates reward hacking, strengthens discriminative capabilities, and improves scalability. These findings offer valuable insights for building more robust Text-to-SQL models. To support further research, we publicly release the code and CoT-enhanced datasets. 
</p> </div> </dd> <dt> <a name='item148'>[148]</a> <a href ="/abs/2502.11671" title="Abstract" id="2502.11671"> arXiv:2502.11671 </a> [<a href="/pdf/2502.11671" title="Download PDF" id="pdf-2502.11671" aria-labelledby="pdf-2502.11671">pdf</a>, <a href="https://arxiv.org/html/2502.11671v1" title="View HTML" id="html-2502.11671" aria-labelledby="html-2502.11671" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11671" title="Other formats" id="oth-2502.11671" aria-labelledby="oth-2502.11671">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Diversity-Oriented Data Augmentation with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zaitian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jinghan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xinhao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kunpeng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Pengfei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuanchun Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Data augmentation is an essential technique in natural language processing (NLP) for enriching training datasets by generating diverse samples. This process is crucial for improving the robustness and generalization capabilities of NLP models. However, a significant challenge remains: \textit{Insufficient Attention to Sample Distribution Diversity}. Most existing methods focus on increasing the sample numbers while neglecting the sample distribution diversity, which can lead to model overfitting. 
In response, we explore data augmentation's impact on dataset diversity and propose a \textbf{\underline{D}}iversity-\textbf{\underline{o}}riented data \textbf{\underline{Aug}}mentation framework (\textbf{DoAug}). Specifically, we utilize a diversity-oriented fine-tuning approach to train an LLM as a diverse paraphraser, which is capable of augmenting textual datasets by generating diversified paraphrases. Then, we apply the LLM paraphraser to a selected coreset of highly informative samples and integrate the paraphrases with the original data to create a more diverse augmented dataset. Finally, we conduct extensive experiments on 12 real-world textual datasets. The results show that our fine-tuned LLM augmenter improves diversity while preserving label consistency, thereby enhancing the robustness and performance of downstream tasks. Specifically, it achieves an average performance gain of \(10.52\%\), surpassing the runner-up baseline by more than three percentage points. 
</p> </div> </dd> <dt> <a name='item149'>[149]</a> <a href ="/abs/2502.11677" title="Abstract" id="2502.11677"> arXiv:2502.11677 </a> [<a href="/pdf/2502.11677" title="Download PDF" id="pdf-2502.11677" aria-labelledby="pdf-2502.11677">pdf</a>, <a href="https://arxiv.org/html/2502.11677v1" title="View HTML" id="html-2502.11677" aria-labelledby="html-2502.11677" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11677" title="Other formats" id="oth-2502.11677" aria-labelledby="oth-2502.11677">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Fully Exploiting LLM Internal States to Enhance Knowledge Boundary Perception </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+S">Shiyu Ni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+K">Keping Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jiafeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+L">Lulu Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+B">Baolong Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xueqi Cheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) exhibit impressive performance across diverse tasks but often struggle to accurately gauge their knowledge boundaries, leading to confident yet incorrect responses. This paper explores leveraging LLMs' internal states to enhance their perception of knowledge boundaries from efficiency and risk perspectives. We investigate whether LLMs can estimate their confidence using internal states before response generation, potentially saving computational resources. 
Our experiments on datasets like Natural Questions, HotpotQA, and MMLU reveal that LLMs demonstrate significant pre-generation perception, which is further refined post-generation, with perception gaps remaining stable across varying conditions. To mitigate risks in critical domains, we introduce Consistency-based Confidence Calibration ($C^3$), which assesses confidence consistency through question reformulation. $C^3$ significantly improves LLMs' ability to recognize their knowledge gaps, enhancing the unknown perception rate by 5.6\% on NQ and 4.9\% on HotpotQA. Our findings suggest that pre-generation confidence estimation can optimize efficiency, while $C^3$ effectively controls output risks, advancing the reliability of LLMs in practical applications. </p> </div> </dd> <dt> <a name='item150'>[150]</a> <a href ="/abs/2502.11681" title="Abstract" id="2502.11681"> arXiv:2502.11681 </a> [<a href="/pdf/2502.11681" title="Download PDF" id="pdf-2502.11681" aria-labelledby="pdf-2502.11681">pdf</a>, <a href="https://arxiv.org/html/2502.11681v1" title="View HTML" id="html-2502.11681" aria-labelledby="html-2502.11681" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11681" title="Other formats" id="oth-2502.11681" aria-labelledby="oth-2502.11681">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RIDE: Enhancing Large Language Model Alignment through Restyled In-Context Learning Demonstration Exemplars </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yuncheng Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+L">Lizhen Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+H">Hao Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Salim,+F+D">Flora D. 
Salim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haffari,+G">Gholamreza Haffari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 37 pages, 1 figure, 20 tables; The paper is under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Alignment tuning is crucial for ensuring large language models (LLMs) behave ethically and helpfully. Current alignment approaches require high-quality annotations and significant training resources. This paper proposes a low-cost, tuning-free method using in-context learning (ICL) to enhance LLM alignment. Through an analysis of high-quality ICL demos, we identified style as a key factor influencing LLM alignment capabilities and explicitly restyled ICL exemplars based on this stylistic framework. Additionally, we combined the restyled demos to achieve a balance between the two conflicting aspects of LLM alignment--factuality and safety. We packaged the restyled examples as prompts to trigger few-shot learning, improving LLM alignment. Compared to the best baseline approach, with an average score of 5.00 as the maximum, our method achieves a maximum 0.10 increase on the Alpaca task (from 4.50 to 4.60), a 0.22 enhancement on the Just-eval benchmark (from 4.34 to 4.56), and a maximum improvement of 0.32 (from 3.53 to 3.85) on the MT-Bench dataset. We release the code and data at <a href="https://github.com/AnonymousCode-ComputerScience/RIDE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item151'>[151]</a> <a href ="/abs/2502.11684" title="Abstract" id="2502.11684"> arXiv:2502.11684 </a> [<a href="/pdf/2502.11684" title="Download PDF" id="pdf-2502.11684" aria-labelledby="pdf-2502.11684">pdf</a>, <a href="https://arxiv.org/html/2502.11684v1" title="View HTML" id="html-2502.11684" aria-labelledby="html-2502.11684" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11684" title="Other formats" id="oth-2502.11684" aria-labelledby="oth-2502.11684">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MathFimer: Enhancing Mathematical Reasoning by Expanding Reasoning Steps through Fill-in-the-Middle Task </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yuchen Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yongliang Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+X">Xin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mengdi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shao,+J">Jian Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuang,+Y">Yueting Zhuang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Mathematical reasoning represents a critical frontier in advancing large language models (LLMs). While step-by-step approaches have emerged as the dominant paradigm for mathematical problem-solving in LLMs, the quality of reasoning steps in training data fundamentally constrains the performance of the models. 
Recent studies have demonstrated that more detailed intermediate steps can enhance model performance, yet existing methods for step expansion either require more powerful external models or incur substantial computational costs. In this paper, we introduce MathFimer, a novel framework for mathematical reasoning step expansion inspired by the "Fill-in-the-middle" task from code completion. By decomposing solution chains into prefix-suffix pairs and training models to reconstruct missing intermediate steps, we develop a specialized model, MathFimer-7B, on our carefully curated NuminaMath-FIM dataset. We then apply these models to enhance existing mathematical reasoning datasets by inserting detailed intermediate steps into their solution chains, creating MathFimer-expanded versions. Through comprehensive experiments on multiple mathematical reasoning datasets, including MathInstruct and MetaMathQA, among others, we demonstrate that models trained on MathFimer-expanded data consistently outperform their counterparts trained on original data across various benchmarks such as GSM8K and MATH. Our approach offers a practical, scalable solution for enhancing mathematical reasoning capabilities in LLMs without relying on powerful external models or expensive inference procedures. 
</p> </div> </dd> <dt> <a name='item152'>[152]</a> <a href ="/abs/2502.11688" title="Abstract" id="2502.11688"> arXiv:2502.11688 </a> [<a href="/pdf/2502.11688" title="Download PDF" id="pdf-2502.11688" aria-labelledby="pdf-2502.11688">pdf</a>, <a href="https://arxiv.org/html/2502.11688v1" title="View HTML" id="html-2502.11688" aria-labelledby="html-2502.11688" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11688" title="Other formats" id="oth-2502.11688" aria-labelledby="oth-2502.11688">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Isolates to Families: Using Neural Networks for Automated Language Affiliation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Blum,+F">Frederic Blum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Herbold,+S">Steffen Herbold</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=List,+J">Johann-Mattis List</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to the 63rd Annual Meeting of the Association for Computational Linguistics, Vienna, Austria </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In historical linguistics, the affiliation of languages to a common language family is traditionally carried out using a complex workflow that relies on manually comparing individual languages. Large-scale standardized collections of multilingual wordlists and grammatical language structures might help to improve this and open new avenues for developing automated language affiliation workflows. Here, we present neural network models that use lexical and grammatical data from a worldwide sample of more than 1,000 languages with known affiliations to classify individual languages into families. 
In line with the traditional assumption of most linguists, our results show that models trained on lexical data alone outperform models solely based on grammatical data, whereas combining both types of data yields even better performance. In additional experiments, we show how our models can identify long-ranging relations between entire subgroups, how they can be employed to investigate potential relatives of linguistic isolates, and how they can help us to obtain first hints on the affiliation of so far unaffiliated languages. We conclude that models for automated language affiliation trained on lexical and grammatical data provide comparative linguists with a valuable tool for evaluating hypotheses about deep and unknown language relations. </p> </div> </dd> <dt> <a name='item153'>[153]</a> <a href ="/abs/2502.11689" title="Abstract" id="2502.11689"> arXiv:2502.11689 </a> [<a href="/pdf/2502.11689" title="Download PDF" id="pdf-2502.11689" aria-labelledby="pdf-2502.11689">pdf</a>, <a href="https://arxiv.org/html/2502.11689v1" title="View HTML" id="html-2502.11689" aria-labelledby="html-2502.11689" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11689" title="Other formats" id="oth-2502.11689" aria-labelledby="oth-2502.11689">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improve LLM-as-a-Judge Ability as a General Ability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jiachen Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+S">Shaoning Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xiaohui Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+J">Jiaxu Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+K">Kaidong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xuelong Li</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> LLM-as-a-Judge leverages the generative and reasoning capabilities of large language models (LLMs) to evaluate LLM responses across diverse scenarios, providing accurate preference signals. This approach plays a vital role in aligning LLMs with human values, ensuring ethical and reliable AI outputs that align with societal norms. Recent studies have proposed many methods to train LLMs as generative judges, but most of them are data consuming or lack accuracy, and only focus on LLM's judge ability. In this work, we regard judge ability as a general ability of LLM and implement a two-stage training approach, comprising supervised fine-tuning (SFT) warm-up and direct preference optimization (DPO) enhancement, to achieve judge style adaptation and improve judgment accuracy. Additionally, we introduce an efficient data synthesis method to generate judgmental content. Experimental results demonstrate that our approach, utilizing only about 2% to 40% of the data required by other methods, achieves SOTA performance on RewardBench. Furthermore, our training method enhances the general capabilities of the model by constructing complicated judge tasks, and the judge signals provided by our model have significantly enhanced the downstream DPO training performance of our internal models in our test to optimize policy model with Judge Model. We also open-source our model weights and training data to facilitate further research. 
</p> </div> </dd> <dt> <a name='item154'>[154]</a> <a href ="/abs/2502.11703" title="Abstract" id="2502.11703"> arXiv:2502.11703 </a> [<a href="/pdf/2502.11703" title="Download PDF" id="pdf-2502.11703" aria-labelledby="pdf-2502.11703">pdf</a>, <a href="https://arxiv.org/html/2502.11703v1" title="View HTML" id="html-2502.11703" aria-labelledby="html-2502.11703" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11703" title="Other formats" id="oth-2502.11703" aria-labelledby="oth-2502.11703">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CMQCIC-Bench: A Chinese Benchmark for Evaluating Large Language Models in Medical Quality Control Indicator Calculation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+G">Guangya Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yanhao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zongying Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Y">Yuxiong Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+L">Li Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yupian Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hou,+R">Ruihui Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weiyan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Y">Yongqi Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+Q">Qi Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jingping Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruan,+T">Tong Ruan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Medical quality control indicators are essential to assess the qualifications of healthcare institutions for medical services. With the impressive performance of large language models (LLMs) like GPT-4 in the medical field, leveraging these technologies for the Medical Quality Control Indicator Calculation (MQCIC) presents a promising approach. In this work, (1) we introduce a real-world task MQCIC and propose an open-source Chinese electronic medical records (EMRs)-based dataset (CMQCIC-Bench) comprising 785 instances and 76 indicators. (2) We propose a semi-automatic method to enhance the rule representation. Then we propose the Clinical Facts-based Inferential Rule (CF-IR) method that disentangles the clinical fact verification and inferential rule reasoning actions. (3) We conduct comprehensive experiments on 20 representative LLMs, covering general and medical models. Our findings reveal that CF-IR outperforms Chain-of-Thought methods in MQCIC tasks. (4) We conduct an error analysis and investigate the capabilities of clinical fact verification and inferential rule reasoning, providing insights to improve performance in the MQCIC further. The dataset and code are available in this repo <a href="https://anonymous.4open.science/r/C-MQCIC-1151" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item155'>[155]</a> <a href ="/abs/2502.11705" title="Abstract" id="2502.11705"> arXiv:2502.11705 </a> [<a href="/pdf/2502.11705" title="Download PDF" id="pdf-2502.11705" aria-labelledby="pdf-2502.11705">pdf</a>, <a href="/format/2502.11705" title="Other formats" id="oth-2502.11705" aria-labelledby="oth-2502.11705">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLM Agents Making Agent Tools </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=W%C3%B6lflein,+G">Georg Wölflein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ferber,+D">Dyke Ferber</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Truhn,+D">Daniel Truhn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arandjelovi%C4%87,+O">Ognjen Arandjelović</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kather,+J+N">Jakob Nikolas Kather</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Tool use has turned large language models (LLMs) into powerful agents that can perform complex multi-step tasks by dynamically utilising external software components. However, these tools must be implemented in advance by human developers, hindering the applicability of LLM agents in domains which demand large numbers of highly specialised tools, like in life sciences and medicine. Motivated by the growing trend of scientific studies accompanied by public code repositories, we propose ToolMaker, a novel agentic framework that autonomously transforms papers with code into LLM-compatible tools. 
Given a short task description and a repository URL, ToolMaker autonomously installs required dependencies and generates code to perform the task, using a closed-loop self-correction mechanism to iteratively diagnose and rectify errors. To evaluate our approach, we introduce a benchmark comprising 15 diverse and complex computational tasks spanning both medical and non-medical domains with over 100 unit tests to objectively assess tool correctness and robustness. ToolMaker correctly implements 80% of the tasks, substantially outperforming current state-of-the-art software engineering agents. ToolMaker therefore is a step towards fully autonomous agent-based scientific workflows. </p> </div> </dd> <dt> <a name='item156'>[156]</a> <a href ="/abs/2502.11707" title="Abstract" id="2502.11707"> arXiv:2502.11707 </a> [<a href="/pdf/2502.11707" title="Download PDF" id="pdf-2502.11707" aria-labelledby="pdf-2502.11707">pdf</a>, <a href="https://arxiv.org/html/2502.11707v1" title="View HTML" id="html-2502.11707" aria-labelledby="html-2502.11707" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11707" title="Other formats" id="oth-2502.11707" aria-labelledby="oth-2502.11707">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Ad-hoc Concept Forming in the Game Codenames as a Means for Evaluating Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hakimov,+S">Sherzod Hakimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pfennigschmidt,+L">Lara Pfennigschmidt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schlangen,+D">David Schlangen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study utilizes the game Codenames as a benchmarking tool to evaluate large language 
models (LLMs) with respect to specific linguistic and cognitive skills. LLMs play each side of the game, where one side generates a clue word covering several target words and the other guesses those target words. We designed various experiments by controlling the choice of words (abstract vs. concrete words, ambiguous vs. monosemic) or the opponent (programmed to be faster or slower in revealing words). Recent commercial and open-weight models were compared side-by-side to find out factors affecting their performance. The evaluation reveals details about their strategies, challenging cases, and limitations of LLMs. </p> </div> </dd> <dt> <a name='item157'>[157]</a> <a href ="/abs/2502.11718" title="Abstract" id="2502.11718"> arXiv:2502.11718 </a> [<a href="/pdf/2502.11718" title="Download PDF" id="pdf-2502.11718" aria-labelledby="pdf-2502.11718">pdf</a>, <a href="https://arxiv.org/html/2502.11718v1" title="View HTML" id="html-2502.11718" aria-labelledby="html-2502.11718" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11718" title="Other formats" id="oth-2502.11718" aria-labelledby="oth-2502.11718">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> "See the World, Discover Knowledge": A Chinese Factuality Evaluation for Large Vision Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jihao Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yingyao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bu,+P">Pi Bu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Ziming Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+T">Tengtao Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+D">Donglai Wei</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiale Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yingxiu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yancheng He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shilong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+M">Meng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Jun Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Y">Yingshui Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+W">Wenbo Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zhicheng Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xiaoyong Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+B">Bo Zheng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages, 21 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> The evaluation of factual accuracy in large vision language models (LVLMs) has lagged behind their rapid development, making it challenging to fully reflect these models' knowledge capacity and reliability. In this paper, we introduce the first factuality-based visual question-answering benchmark in Chinese, named ChineseSimpleVQA, aimed at assessing the visual factuality of LVLMs across 8 major topics and 56 subtopics. 
The key features of this benchmark include a focus on the Chinese language, diverse knowledge types, a multi-hop question construction, high-quality data, static consistency, and easy-to-evaluate through short answers. Moreover, we contribute a rigorous data construction pipeline and decouple the visual factuality into two parts: seeing the world (i.e., object recognition) and discovering knowledge. This decoupling allows us to analyze the capability boundaries and execution mechanisms of LVLMs. Subsequently, we evaluate 34 advanced open-source and closed-source models, revealing critical performance gaps within this field. </p> </div> </dd> <dt> <a name='item158'>[158]</a> <a href ="/abs/2502.11733" title="Abstract" id="2502.11733"> arXiv:2502.11733 </a> [<a href="/pdf/2502.11733" title="Download PDF" id="pdf-2502.11733" aria-labelledby="pdf-2502.11733">pdf</a>, <a href="https://arxiv.org/html/2502.11733v1" title="View HTML" id="html-2502.11733" aria-labelledby="html-2502.11733" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11733" title="Other formats" id="oth-2502.11733" aria-labelledby="oth-2502.11733">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Plant in Cupboard, Orange on Table, Book on Shelf. 
Benchmarking Practical Reasoning and Situation Modelling in a Text-Simulated Situated Environment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jordan,+J">Jonathan Jordan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakimov,+S">Sherzod Hakimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schlangen,+D">David Schlangen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have risen to prominence as 'chatbots' for users to interact via natural language. However, their abilities to capture common-sense knowledge make them seem promising as language-based planners of situated or embodied action as well. We have implemented a simple text-based environment -- similar to others that have before been used for reinforcement-learning of agents -- that simulates, very abstractly, a household setting. We use this environment and the detailed error-tracking capabilities we implemented for targeted benchmarking of LLMs on the problem of practical reasoning: Going from goals and observations to actions. Our findings show that environmental complexity and game restrictions hamper performance, and concise action planning is demanding for current LLMs. 
</p> </div> </dd> <dt> <a name='item159'>[159]</a> <a href ="/abs/2502.11735" title="Abstract" id="2502.11735"> arXiv:2502.11735 </a> [<a href="/pdf/2502.11735" title="Download PDF" id="pdf-2502.11735" aria-labelledby="pdf-2502.11735">pdf</a>, <a href="https://arxiv.org/html/2502.11735v1" title="View HTML" id="html-2502.11735" aria-labelledby="html-2502.11735" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11735" title="Other formats" id="oth-2502.11735" aria-labelledby="oth-2502.11735">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MT-RAIG: Novel Benchmark and Evaluation Framework for Retrieval-Augmented Insight Generation over Multiple Tables </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+K">Kwangwook Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kwon,+D">Donguk Kwon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D">Dongha Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent advancements in table-based reasoning have expanded beyond factoid-level QA to address insight-level tasks, where systems should synthesize implicit knowledge in the table to provide explainable analyses. Although effective, existing studies remain confined to scenarios where a single gold table is given alongside the user query, failing to address cases where users seek comprehensive insights from multiple unknown tables. To bridge these gaps, we propose MT-RAIG Bench, designed to evaluate systems on Retrieval-Augmented Insight Generation over Multiple Tables. 
Additionally, to tackle the suboptimality of existing automatic evaluation methods in the table domain, we further introduce a fine-grained evaluation framework MT-RAIG Eval, which achieves better alignment with human quality judgments on the generated insights. We conduct extensive experiments and reveal that even frontier LLMs still struggle with complex multi-table reasoning, establishing our MT-RAIG Bench as a challenging testbed for future research. </p> </div> </dd> <dt> <a name='item160'>[160]</a> <a href ="/abs/2502.11736" title="Abstract" id="2502.11736"> arXiv:2502.11736 </a> [<a href="/pdf/2502.11736" title="Download PDF" id="pdf-2502.11736" aria-labelledby="pdf-2502.11736">pdf</a>, <a href="https://arxiv.org/html/2502.11736v1" title="View HTML" id="html-2502.11736" aria-labelledby="html-2502.11736" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11736" title="Other formats" id="oth-2502.11736" aria-labelledby="oth-2502.11736">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ReviewEval: An Evaluation Framework for AI-Generated Reviews </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kirtani,+C">Chavvi Kirtani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Garg,+M+K">Madhav Krishan Garg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Prasad,+T">Tejash Prasad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singhal,+T">Tanmay Singhal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mandal,+M">Murari Mandal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+D">Dhruv Kumar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review: 8 pages, 2 figures, 2 tables, 3 pages for appendix </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The escalating volume of academic research, coupled with a shortage of qualified reviewers, necessitates innovative approaches to peer review. While large language models (LLMs) offer potential for automating this process, their current limitations include superficial critiques, hallucinations, and a lack of actionable insights. This research addresses these challenges by introducing a comprehensive evaluation framework for AI-generated reviews that measures alignment with human evaluations, verifies factual accuracy, assesses analytical depth, and identifies actionable insights. We also propose a novel alignment mechanism that tailors LLM-generated reviews to the unique evaluation priorities of individual conferences and journals. To enhance the quality of these reviews, we introduce a self-refinement loop that iteratively optimizes the LLM's review prompts. Our framework establishes standardized metrics for evaluating AI-based review systems, thereby bolstering the reliability of AI-generated reviews in academic research. 
</p> </div> </dd> <dt> <a name='item161'>[161]</a> <a href ="/abs/2502.11766" title="Abstract" id="2502.11766"> arXiv:2502.11766 </a> [<a href="/pdf/2502.11766" title="Download PDF" id="pdf-2502.11766" aria-labelledby="pdf-2502.11766">pdf</a>, <a href="https://arxiv.org/html/2502.11766v1" title="View HTML" id="html-2502.11766" aria-labelledby="html-2502.11766" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11766" title="Other formats" id="oth-2502.11766" aria-labelledby="oth-2502.11766">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Warmup-Distill: Bridge the Distribution Mismatch between Teacher and Student before Knowledge Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zengkui Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yijin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+F">Fandong Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yufeng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jinan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jie Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 Pages, 4 figures, Code at <a href="https://github.com/Acerkoo/WarmupDistill" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The widespread deployment of Large Language Models (LLMs) is hindered by the high computational demands, making knowledge distillation (KD) crucial for developing compact smaller ones. 
However, the conventional KD methods endure the distribution mismatch issue between the teacher and student models, leading to the poor performance of distillation. For instance, the widely-used KL-based methods suffer the mode-averaging and mode-collapsing problems, due to the mismatched probability distribution between the two models. Previous studies mainly optimize this issue via different distance calculations towards the distribution of both models. Unfortunately, the distribution mismatch issue still exists in the early stage of the distillation. Hence, to reduce the impact of distribution mismatch, we propose a simple yet efficient method, named Warmup-Distill, which aligns the distillation of the student to that of the teacher in advance of distillation. Specifically, we first detect the distribution of the student model in practical scenarios with its internal knowledge, and then modify the knowledge with low probability via the teacher as the checker. Consequently, Warmup-Distill aligns the internal student's knowledge to that of the teacher, which expands the distribution of the student with the teacher's, and assists the student model to learn better in the subsequent distillation. Experiments on the seven benchmarks demonstrate that Warmup-Distill could provide a warmup student more suitable for distillation, which outperforms the vanilla student by at least +0.4 averaged score among all benchmarks. Notably, with the assistance of Warmup-Distill, the distillation on the math task could yield a further improvement, at most +1.9% accuracy. 
</p> </div> </dd> <dt> <a name='item162'>[162]</a> <a href ="/abs/2502.11771" title="Abstract" id="2502.11771"> arXiv:2502.11771 </a> [<a href="/pdf/2502.11771" title="Download PDF" id="pdf-2502.11771" aria-labelledby="pdf-2502.11771">pdf</a>, <a href="https://arxiv.org/html/2502.11771v1" title="View HTML" id="html-2502.11771" aria-labelledby="html-2502.11771" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11771" title="Other formats" id="oth-2502.11771" aria-labelledby="oth-2502.11771">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Validation Gap: A Mechanistic Analysis of How Language Models Compute Arithmetic but Fail to Validate It </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bertolazzi,+L">Leonardo Bertolazzi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mondorf,+P">Philipp Mondorf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plank,+B">Barbara Plank</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bernardi,+R">Raffaella Bernardi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 34 pages, 31 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The ability of large language models (LLMs) to validate their output and identify potential errors is crucial for ensuring robustness and reliability. However, current research indicates that LLMs struggle with self-correction, encountering significant challenges in detecting errors. While studies have explored methods to enhance self-correction in LLMs, relatively little attention has been given to understanding the models' internal mechanisms underlying error detection. 
In this paper, we present a mechanistic analysis of error detection in LLMs, focusing on simple arithmetic problems. Through circuit analysis, we identify the computational subgraphs responsible for detecting arithmetic errors across four smaller-sized LLMs. Our findings reveal that all models heavily rely on $\textit{consistency heads}$--attention heads that assess surface-level alignment of numerical values in arithmetic solutions. Moreover, we observe that the models' internal arithmetic computation primarily occurs in higher layers, whereas validation takes place in middle layers, before the final arithmetic results are fully encoded. This structural dissociation between arithmetic computation and validation seems to explain why current LLMs struggle to detect even simple arithmetic errors. </p> </div> </dd> <dt> <a name='item163'>[163]</a> <a href ="/abs/2502.11779" title="Abstract" id="2502.11779"> arXiv:2502.11779 </a> [<a href="/pdf/2502.11779" title="Download PDF" id="pdf-2502.11779" aria-labelledby="pdf-2502.11779">pdf</a>, <a href="https://arxiv.org/html/2502.11779v1" title="View HTML" id="html-2502.11779" aria-labelledby="html-2502.11779" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11779" title="Other formats" id="oth-2502.11779" aria-labelledby="oth-2502.11779">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient Response Generation Method Selection for Fine-Tuning Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+X">Xuan Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Lingqiao Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The training data 
for fine-tuning large language models (LLMs) is typically structured as input-output pairs. However, for many tasks, there can be multiple equally valid output variations for the same input. Recent studies have observed that the choice of output variation used in training can affect the model's performance. This raises an important question: how can we generate the most effective output from the many possible response generation strategy options? Rather than relying on the traditional but resource-intensive train-and-evaluate approach, this paper proposes a scalable, approximate method for estimating the quality of a small subset of generated training data derived from the same input. We then evaluate how well this small subset of generated output fits the target model we are trying to train. We present a large-scale benchmark covering diverse reasoning-based datasets to support our study. <br>The central idea is that a good output should closely resemble the output generated by the target LLM. We formalize this 'closeness' as the expected alignment score between a candidate output and the output sampled from the target LLM. We connect this measurement to the perplexity metric used in previous literature and demonstrate that leveraging an alignment-based metric can provide better predictions of model performance. Using this strategy, we can evaluate a small subset of the generated output from each response generation strategy option, then select the most effective strategy. We show that an LLM trained on data generated by the selected strategy could lead to a significant performance gain in many cases. 
</p> </div> </dd> <dt> <a name='item164'>[164]</a> <a href ="/abs/2502.11789" title="Abstract" id="2502.11789"> arXiv:2502.11789 </a> [<a href="/pdf/2502.11789" title="Download PDF" id="pdf-2502.11789" aria-labelledby="pdf-2502.11789">pdf</a>, <a href="https://arxiv.org/html/2502.11789v1" title="View HTML" id="html-2502.11789" aria-labelledby="html-2502.11789" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11789" title="Other formats" id="oth-2502.11789" aria-labelledby="oth-2502.11789">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Personality Editing for Language Models through Relevant Knowledge Editing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+S">Seojin Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y">Yumin Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+B">Byeongjeong Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hwanhee Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 3 figures, 16 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) play a vital role in applications like conversational agents and content creation, where controlling a model's personality is crucial for maintaining tone, consistency, and engagement. However, traditional prompt-based techniques for controlling personality often fall short, as they do not effectively mitigate the model's inherent biases. In this paper, we introduce a novel method PALETTE that enhances personality control through knowledge editing. 
By generating adjustment queries inspired by psychological assessments, our approach systematically adjusts responses to personality-related queries similar to modifying factual knowledge, thereby achieving controlled shifts in personality traits. Experimental results from both automatic and human evaluations demonstrate that our method enables more stable and well-balanced personality control in LLMs. </p> </div> </dd> <dt> <a name='item165'>[165]</a> <a href ="/abs/2502.11806" title="Abstract" id="2502.11806"> arXiv:2502.11806 </a> [<a href="/pdf/2502.11806" title="Download PDF" id="pdf-2502.11806" aria-labelledby="pdf-2502.11806">pdf</a>, <a href="https://arxiv.org/html/2502.11806v1" title="View HTML" id="html-2502.11806" aria-labelledby="html-2502.11806" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11806" title="Other formats" id="oth-2502.11806" aria-labelledby="oth-2502.11806">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Translation Mechanism of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongbin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+X">Xuefeng Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiucheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have succeeded remarkably in multilingual translation tasks. However, the inherent translation mechanisms of LLMs remain poorly understood, largely due to sophisticated architectures and vast parameter scales. 
In response to this issue, this study explores the translation mechanism of LLM from the perspective of computational components (e.g., attention heads and MLPs). Path patching is utilized to explore causal relationships between components, detecting those crucial for translation tasks and subsequently analyzing their behavioral patterns in human-interpretable terms. Comprehensive analysis reveals that translation is predominantly facilitated by a sparse subset of specialized attention heads (less than 5\%), which extract source language, indicator, and positional features. MLPs subsequently integrate and process these features by transiting towards English-centric latent representations. Notably, building on the above findings, targeted fine-tuning of only 64 heads achieves translation improvement comparable to full-parameter tuning while preserving general capabilities. </p> </div> </dd> <dt> <a name='item166'>[166]</a> <a href ="/abs/2502.11811" title="Abstract" id="2502.11811"> arXiv:2502.11811 </a> [<a href="/pdf/2502.11811" title="Download PDF" id="pdf-2502.11811" aria-labelledby="pdf-2502.11811">pdf</a>, <a href="https://arxiv.org/html/2502.11811v1" title="View HTML" id="html-2502.11811" aria-labelledby="html-2502.11811" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11811" title="Other formats" id="oth-2502.11811" aria-labelledby="oth-2502.11811">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FineFilter: A Fine-grained Noise Filtering Mechanism for Retrieval-Augmented Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qianchi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hainan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+L">Liang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hongwei 
Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+Y">Yongxin Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zhiming Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieved documents containing noise will hinder Retrieval-Augmented Generation (RAG) from detecting answer clues, necessitating noise filtering mechanisms to enhance accuracy. Existing methods use re-ranking or summarization to identify the most relevant sentences, but directly and accurately locating answer clues from these large-scale and complex documents remains challenging. Unlike these document-level operations, we treat noise filtering as a sentence-level MinMax optimization problem: first identifying the potential clues from multiple documents using contextual information, then ranking them by relevance, and finally retaining the least clues through truncation. In this paper, we propose FineFilter, a novel fine-grained noise filtering mechanism for RAG consisting of a clue extractor, a re-ranker, and a truncator. We optimize each module to tackle complex reasoning challenges: (1) Clue extractor firstly uses sentences containing the answer and similar ones as fine-tuned targets, aiming at extracting sufficient potential clues; (2) Re-ranker is trained to prioritize effective clues based on the real feedback from generation module, with clues capable of generating correct answer as positive samples and others as negative; (3) Truncator takes the minimum clues needed to answer the question (truncation point) as fine-tuned targets, and performs truncation on the re-ranked clues to achieve fine-grained noise filtering. 
Experiments on three QA datasets demonstrate that FineFilter significantly outperforms baselines in terms of performance and inference cost. Further analysis on each module shows the effectiveness of our optimizations for complex reasoning. </p> </div> </dd> <dt> <a name='item167'>[167]</a> <a href ="/abs/2502.11812" title="Abstract" id="2502.11812"> arXiv:2502.11812 </a> [<a href="/pdf/2502.11812" title="Download PDF" id="pdf-2502.11812" aria-labelledby="pdf-2502.11812">pdf</a>, <a href="https://arxiv.org/html/2502.11812v1" title="View HTML" id="html-2502.11812" aria-labelledby="html-2502.11812" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11812" title="Other formats" id="oth-2502.11812" aria-labelledby="oth-2502.11812">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Understanding Fine-Tuning Mechanisms of LLMs via Circuit Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+W">Wenyu Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+R">Reynold Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+D">Difan Zou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 25 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Fine-tuning significantly improves the performance of Large Language Models (LLMs), yet its underlying mechanisms remain poorly understood. 
This paper aims to provide an in-depth interpretation of the fine-tuning process through circuit analysis, a popular tool in Mechanistic Interpretability (MI). Unlike previous studies \cite{prakash2024finetuningenhancesexistingmechanisms,chhabra2024neuroplasticity} that focus on tasks where pre-trained models already perform well, we develop a set of mathematical tasks where fine-tuning yields substantial performance gains, which are closer to the practical setting. In our experiments, we identify circuits at various checkpoints during fine-tuning and examine the interplay between circuit analysis, fine-tuning methods, and task complexities. First, we find that while circuits maintain high node similarity before and after fine-tuning, their edges undergo significant changes, which is in contrast to the previous work \cite{prakash2024finetuningenhancesexistingmechanisms,chhabra2024neuroplasticity} that show circuits only add some additional components after fine-tuning. Based on these observations, we develop a circuit-aware Low-Rank Adaptation (LoRA) method, which assigns ranks to layers based on edge changes in the circuits. Experimental results demonstrate that our circuit-based LoRA algorithm achieves an average performance improvement of 2.46\% over standard LoRA with similar parameter sizes. Furthermore, we explore how combining circuits from subtasks can enhance fine-tuning in compositional tasks, providing new insights into the design of such tasks and deepening the understanding of circuit dynamics and fine-tuning mechanisms. 
</p> </div> </dd> <dt> <a name='item168'>[168]</a> <a href ="/abs/2502.11824" title="Abstract" id="2502.11824"> arXiv:2502.11824 </a> [<a href="/pdf/2502.11824" title="Download PDF" id="pdf-2502.11824" aria-labelledby="pdf-2502.11824">pdf</a>, <a href="/format/2502.11824" title="Other formats" id="oth-2502.11824" aria-labelledby="oth-2502.11824">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> M-ABSA: A Multilingual Dataset for Aspect-Based Sentiment Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chengyan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+B">Bolei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yihong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zheyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+N">Ningyuan Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yanshu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+B">Baolan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plank,+B">Barbara Plank</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+Y">Yun Xue</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Aspect-based sentiment analysis (ABSA) is a crucial task in information extraction and sentiment analysis, aiming to identify aspects with associated sentiment elements in text. However, existing ABSA datasets are predominantly English-centric, limiting the scope for multilingual evaluation and research. 
To bridge this gap, we present M-ABSA, a comprehensive dataset spanning 7 domains and 21 languages, making it the most extensive multilingual parallel dataset for ABSA to date. Our primary focus is on triplet extraction, which involves identifying aspect terms, aspect categories, and sentiment polarities. The dataset is constructed through an automatic translation process with human review to ensure quality. We perform extensive experiments using various baselines to assess performance and compatibility on M-ABSA. Our empirical findings highlight that the dataset enables diverse evaluation tasks, such as multilingual and multi-domain transfer learning, and large language model evaluation, underscoring its inclusivity and its potential to drive advancements in multilingual ABSA research. </p> </div> </dd> <dt> <a name='item169'>[169]</a> <a href ="/abs/2502.11829" title="Abstract" id="2502.11829"> arXiv:2502.11829 </a> [<a href="/pdf/2502.11829" title="Download PDF" id="pdf-2502.11829" aria-labelledby="pdf-2502.11829">pdf</a>, <a href="https://arxiv.org/html/2502.11829v1" title="View HTML" id="html-2502.11829" aria-labelledby="html-2502.11829" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11829" title="Other formats" id="oth-2502.11829" aria-labelledby="oth-2502.11829">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Code-Vision: Evaluating Multimodal LLMs Logic Understanding and Code Generation Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hanbin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiaoxuan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhipeng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+K">Keyuan Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zuo,+Y">Yuxin Zuo</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+K">Kai Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Jingwei Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Junting Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+W">Wenhui Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xueyang Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Software Engineering (cs.SE) </div> <p class='mathjax'> This paper introduces Code-Vision, a benchmark designed to evaluate the logical understanding and code generation capabilities of Multimodal Large Language Models (MLLMs). It challenges MLLMs to generate a correct program that fulfills specific functionality requirements based on a given flowchart, which visually represents the desired algorithm or process. Code-Vision comprises three subsets: HumanEval-V, Algorithm, and MATH, which evaluate MLLMs' coding abilities across basic programming, algorithmic, and mathematical problem-solving domains. Our experiments evaluate 12 MLLMs on Code-Vision. Experimental results demonstrate that there is a large performance difference between proprietary and open-source models. On Hard problems, GPT-4o can achieve 79.3% pass@1, but the best open-source model only achieves 15%. Further experiments reveal that Code-Vision can pose unique challenges compared to other multimodal reasoning benchmarks MMCode and MathVista. We also explore the reason for the poor performance of the open-source models. All data and codes are available at <a href="https://github.com/wanghanbinpanda/CodeVision" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item170'>[170]</a> <a href ="/abs/2502.11830" title="Abstract" id="2502.11830"> arXiv:2502.11830 </a> [<a href="/pdf/2502.11830" title="Download PDF" id="pdf-2502.11830" aria-labelledby="pdf-2502.11830">pdf</a>, <a href="https://arxiv.org/html/2502.11830v1" title="View HTML" id="html-2502.11830" aria-labelledby="html-2502.11830" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11830" title="Other formats" id="oth-2502.11830" aria-labelledby="oth-2502.11830">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Text Classification in the LLM Era - Where do we stand? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Vajjala,+S">Sowmya Vajjala</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shimangaud,+S">Shwetali Shimangaud</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Pre-print </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models revolutionized NLP and showed dramatic performance improvements across several tasks. In this paper, we investigated the role of such language models in text classification and how they compare with other approaches relying on smaller pre-trained language models. Considering 32 datasets spanning 8 languages, we compared zero-shot classification, few-shot fine-tuning and synthetic data based classifiers with classifiers built using the complete human labeled dataset. Our results show that zero-shot approaches do well for sentiment classification, but are outperformed by other approaches for the rest of the tasks, and synthetic data sourced from multiple LLMs can build better classifiers than zero-shot open LLMs. 
We also see wide performance disparities across languages in all the classification scenarios. We expect that these findings would guide practitioners working on developing text classification systems across languages. </p> </div> </dd> <dt> <a name='item171'>[171]</a> <a href ="/abs/2502.11843" title="Abstract" id="2502.11843"> arXiv:2502.11843 </a> [<a href="/pdf/2502.11843" title="Download PDF" id="pdf-2502.11843" aria-labelledby="pdf-2502.11843">pdf</a>, <a href="https://arxiv.org/html/2502.11843v1" title="View HTML" id="html-2502.11843" aria-labelledby="html-2502.11843" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11843" title="Other formats" id="oth-2502.11843" aria-labelledby="oth-2502.11843">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can LLM Agents Maintain a Persona in Discourse? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhandari,+P">Pranav Bhandari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fay,+N">Nicolas Fay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wise,+M">Michael Wise</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Datta,+A">Amitava Datta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meek,+S">Stephanie Meek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naseem,+U">Usman Naseem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nasim,+M">Mehwish Nasim</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Social and Information Networks (cs.SI) </div> <p class='mathjax'> Large Language Models (LLMs) are widely used as conversational agents, exploiting their capabilities in various sectors such as education, law, medicine, and more. 
However, LLMs are often subjected to context-shifting behaviour, resulting in a lack of consistent and interpretable personality-aligned interactions. Adherence to psychological traits lacks comprehensive analysis, especially in the case of dyadic (pairwise) conversations. We examine this challenge from two viewpoints, initially using two conversation agents to generate a discourse on a certain topic with an assigned personality from the OCEAN framework (Openness, Conscientiousness, Extraversion, Agreeableness, and Neuroticism) as High/Low for each trait. This is followed by using multiple judge agents to infer the original traits assigned to explore prediction consistency, inter-model agreement, and alignment with the assigned personality. Our findings indicate that while LLMs can be guided toward personality-driven dialogue, their ability to maintain personality traits varies significantly depending on the combination of models and discourse settings. These inconsistencies emphasise the challenges in achieving stable and interpretable personality-aligned interactions in LLMs. 
</p> </div> </dd> <dt> <a name='item172'>[172]</a> <a href ="/abs/2502.11856" title="Abstract" id="2502.11856"> arXiv:2502.11856 </a> [<a href="/pdf/2502.11856" title="Download PDF" id="pdf-2502.11856" aria-labelledby="pdf-2502.11856">pdf</a>, <a href="https://arxiv.org/html/2502.11856v1" title="View HTML" id="html-2502.11856" aria-labelledby="html-2502.11856" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11856" title="Other formats" id="oth-2502.11856" aria-labelledby="oth-2502.11856">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs as a synthesis between symbolic and continuous approaches to language </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Boleda,+G">Gemma Boleda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Since the middle of the 20th century, a fierce battle has been fought between symbolic and continuous approaches to language and cognition. The success of deep learning models, and LLMs in particular, has been alternatively taken as showing that the continuous camp has won, or dismissed as an irrelevant engineering development. However, in this position paper I argue that deep learning models for language actually represent a synthesis between the two traditions. This is because 1) deep learning architectures allow for both continuous/distributed and symbolic/discrete-like representations and computations; 2) models trained on language make use of this flexibility. In particular, I review recent research in mechanistic interpretability that showcases how a substantial part of morphosyntactic knowledge is encoded in a near-discrete fashion in LLMs. 
This line of research suggests that different behaviors arise in an emergent fashion, and models flexibly alternate between the two modes (and everything in between) as needed. This is possibly one of the main reasons for their wild success; and it is also what makes them particularly interesting for the study of language and cognition. Is it time for peace? </p> </div> </dd> <dt> <a name='item173'>[173]</a> <a href ="/abs/2502.11861" title="Abstract" id="2502.11861"> arXiv:2502.11861 </a> [<a href="/pdf/2502.11861" title="Download PDF" id="pdf-2502.11861" aria-labelledby="pdf-2502.11861">pdf</a>, <a href="/format/2502.11861" title="Other formats" id="oth-2502.11861" aria-labelledby="oth-2502.11861">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Large Language Models in Healthcare: Insights into Corpora Sources, Customization Strategies, and Evaluation Metrics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shuqi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jing,+M">Mingrui Jing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kou,+J">Jiaxin Kou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+M">Manfei Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+W">Weijie Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zheng Zhu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 45 pages, 1 figure, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study reviewed the use of Large Language Models 
(LLMs) in healthcare, focusing on their training corpora, customization techniques, and evaluation metrics. A systematic search of studies from 2021 to 2024 identified 61 articles. Four types of corpora were used: clinical resources, literature, open-source datasets, and web-crawled data. Common construction techniques included pre-training, prompt engineering, and retrieval-augmented generation, with 44 studies combining multiple methods. Evaluation metrics were categorized into process, usability, and outcome metrics, with outcome metrics divided into model-based and expert-assessed outcomes. The study identified critical gaps in corpus fairness, which contributed to biases from geographic, cultural, and socio-economic factors. The reliance on unverified or unstructured data highlighted the need for better integration of evidence-based clinical guidelines. Future research should focus on developing a tiered corpus architecture with vetted sources and dynamic weighting, while ensuring model transparency. Additionally, the lack of standardized evaluation frameworks for domain-specific models called for comprehensive validation of LLMs in real-world healthcare settings. 
</p> </div> </dd> <dt> <a name='item174'>[174]</a> <a href ="/abs/2502.11862" title="Abstract" id="2502.11862"> arXiv:2502.11862 </a> [<a href="/pdf/2502.11862" title="Download PDF" id="pdf-2502.11862" aria-labelledby="pdf-2502.11862">pdf</a>, <a href="https://arxiv.org/html/2502.11862v1" title="View HTML" id="html-2502.11862" aria-labelledby="html-2502.11862" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11862" title="Other formats" id="oth-2502.11862" aria-labelledby="oth-2502.11862">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding In-Context Machine Translation for Low-Resource Languages: A Case Study on Manchu </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pei,+R">Renhao Pei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yihong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+P">Peiqin Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yvon,+F">François Yvon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sch%C3%BCtze,+H">Hinrich Schütze</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In-context machine translation (MT) with large language models (LLMs) is a promising approach for low-resource MT, as it can readily take advantage of linguistic resources such as grammar books and dictionaries. Such resources are usually selectively integrated into the prompt so that LLMs can directly perform translation without any specific training, via their in-context learning capability (ICL). 
However, the relative importance of each type of resource e.g., dictionary, grammar book, and retrieved parallel examples, is not entirely clear. To address this gap, this study systematically investigates how each resource and its quality affects the translation performance, with the Manchu language as our case study. To remove any prior knowledge of Manchu encoded in the LLM parameters and single out the effect of ICL, we also experiment with an encrypted version of Manchu texts. Our results indicate that high-quality dictionaries and good parallel examples are very helpful, while grammars hardly help. In a follow-up study, we showcase a promising application of in-context MT: parallel data augmentation as a way to bootstrap the conventional MT model. When monolingual data abound, generating synthetic parallel data through in-context MT offers a pathway to mitigate data scarcity and build effective and efficient low-resource neural MT systems. </p> </div> </dd> <dt> <a name='item175'>[175]</a> <a href ="/abs/2502.11866" title="Abstract" id="2502.11866"> arXiv:2502.11866 </a> [<a href="/pdf/2502.11866" title="Download PDF" id="pdf-2502.11866" aria-labelledby="pdf-2502.11866">pdf</a>, <a href="https://arxiv.org/html/2502.11866v1" title="View HTML" id="html-2502.11866" aria-labelledby="html-2502.11866" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11866" title="Other formats" id="oth-2502.11866" aria-labelledby="oth-2502.11866">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Southern Newswire Corpus: A Large-Scale Dataset of Mid-Century Wire Articles Beyond the Front Page </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=McRae,+M">Michael McRae</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> I introduce a 
new large-scale dataset of historical wire articles from U.S. Southern newspapers, spanning 1960-1975 and covering multiple wire services: The Associated Press, United Press International, Newspaper Enterprise Association. Unlike prior work focusing on front-page content, this dataset captures articles across the entire newspaper, offering broader insight into mid-century Southern coverage. The dataset includes a version that has undergone an LLM-based text cleanup pipeline to reduce OCR noise, enhancing its suitability for quantitative text analysis. Additionally, duplicate versions of articles are retained to enable analysis of editorial differences in language and framing across newspapers. Each article is tagged by wire service, facilitating comparative studies of editorial patterns across agencies. This resource opens new avenues for research in computational social science, digital humanities, and historical linguistics, providing a detailed perspective on how Southern newspapers relayed national and international news during a transformative period in American history. The dataset will be made available upon publication or request for research purposes. </p> </div> </dd> <dt> <a name='item176'>[176]</a> <a href ="/abs/2502.11874" title="Abstract" id="2502.11874"> arXiv:2502.11874 </a> [<a href="/pdf/2502.11874" title="Download PDF" id="pdf-2502.11874" aria-labelledby="pdf-2502.11874">pdf</a>, <a href="https://arxiv.org/html/2502.11874v1" title="View HTML" id="html-2502.11874" aria-labelledby="html-2502.11874" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11874" title="Other formats" id="oth-2502.11874" aria-labelledby="oth-2502.11874">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VAQUUM: Are Vague Quantifiers Grounded in Visual Data? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+H+M">Hugh Mee Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nouwen,+R">Rick Nouwen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gatt,+A">Albert Gatt</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ARR ACL 2025, 12 pages for main paper (5 figures), 15 pages including appendix (2 figures) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Vague quantifiers such as "a few" and "many" are influenced by many contextual factors, including how many objects are present in a given context. In this work, we evaluate the extent to which vision-and-language models (VLMs) are compatible with humans when producing or judging the appropriateness of vague quantifiers in visual contexts. We release a novel dataset, VAQUUM, containing 20300 human ratings on quantified statements across a total of 1089 images. Using this dataset, we compare human judgments and VLM predictions using three different evaluation methods. Our findings show that VLMs, like humans, are influenced by object counts in vague quantifier use. However, we find significant inconsistencies across models in different evaluation settings, suggesting that judging and producing vague quantifiers rely on two different processes. 
</p> </div> </dd> <dt> <a name='item177'>[177]</a> <a href ="/abs/2502.11890" title="Abstract" id="2502.11890"> arXiv:2502.11890 </a> [<a href="/pdf/2502.11890" title="Download PDF" id="pdf-2502.11890" aria-labelledby="pdf-2502.11890">pdf</a>, <a href="/format/2502.11890" title="Other formats" id="oth-2502.11890" aria-labelledby="oth-2502.11890">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revisiting Classification Taxonomy for Grammatical Errors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+D">Deqing Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yulu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zishan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-Tao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+B">Bingxu An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Z">Zhao Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yong Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 30 pages, 4 figures and 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Grammatical error classification plays a crucial role in language learning systems, but existing classification taxonomies often lack rigorous validation, leading to inconsistencies and unreliable feedback. 
In this paper, we revisit previous classification taxonomies for grammatical errors by introducing a systematic and qualitative evaluation framework. Our approach examines four aspects of a taxonomy, i.e., exclusivity, coverage, balance, and usability. Then, we construct a high-quality grammatical error classification dataset annotated with multiple classification taxonomies and evaluate them grounded in our proposed evaluation framework. Our experiments reveal the drawbacks of existing taxonomies. Our contributions aim to improve the precision and effectiveness of error analysis, providing more understandable and actionable feedback for language learners. </p> </div> </dd> <dt> <a name='item178'>[178]</a> <a href ="/abs/2502.11901" title="Abstract" id="2502.11901"> arXiv:2502.11901 </a> [<a href="/pdf/2502.11901" title="Download PDF" id="pdf-2502.11901" aria-labelledby="pdf-2502.11901">pdf</a>, <a href="https://arxiv.org/html/2502.11901v1" title="View HTML" id="html-2502.11901" aria-labelledby="html-2502.11901" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11901" title="Other formats" id="oth-2502.11901" aria-labelledby="oth-2502.11901">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Building A Proof-Oriented Programmer That Is 64% Better Than GPT-4o Under Data Scarsity </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Dylan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Justin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+T">Tianran Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Programming Languages (cs.PL); Software Engineering (cs.SE) </div> <p class='mathjax'> Existing LMs struggle with proof-oriented programming due to data scarcity, which 
manifests in two key ways: (1) a lack of sufficient corpora for proof-oriented programming languages such as F*, and (2) the absence of large-scale, project-level proof-oriented implementations that can teach the model the intricate reasoning process when performing proof-oriented programming. We present the first work on synthetic data augmentation for project-level proof-oriented programming for both generation and repair. Our method addresses data scarcity by synthesizing basic proof-oriented programming problems for proficiency in that language; incorporating diverse coding data for reasoning capability elicitation and creating new proofs and repair data within existing repositories. This approach enables language models to both synthesize and repair proofs for function- and repository-level code. We show that our fine-tuned 14B parameter model, PoPilot, can exceed the performance of the models that outperform GPT-4o in project-level proof-oriented programming by a 64% relative margin, and can improve GPT-4o's performance by 54% by repairing its outputs over GPT-4o's self-repair. 
</p> </div> </dd> <dt> <a name='item179'>[179]</a> <a href ="/abs/2502.11903" title="Abstract" id="2502.11903"> arXiv:2502.11903 </a> [<a href="/pdf/2502.11903" title="Download PDF" id="pdf-2502.11903" aria-labelledby="pdf-2502.11903">pdf</a>, <a href="https://arxiv.org/html/2502.11903v1" title="View HTML" id="html-2502.11903" aria-labelledby="html-2502.11903" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11903" title="Other formats" id="oth-2502.11903" aria-labelledby="oth-2502.11903">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MMRC: A Large-Scale Benchmark for Understanding Multimodal Large Language Model in Real-World Conversation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+H">Haochen Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+F">Feilong Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Ming Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yexin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Q">Qidong Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yulong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chengzhi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhongxing Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Chong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chun-Mei Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Y">Yutong Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Razzak,+I">Imran Razzak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Z">Zongyuan Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jionglong Su</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Junjun He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+Y">Yu Qiao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent multimodal large language models (MLLMs) have demonstrated significant potential in open-ended conversation, generating more accurate and personalized responses. However, their abilities to memorize, recall, and reason in sustained interactions within real-world scenarios remain underexplored. This paper introduces MMRC, a Multi-Modal Real-world Conversation benchmark for evaluating six core open-ended abilities of MLLMs: information extraction, multi-turn reasoning, information update, image management, memory recall, and answer refusal. With data collected from real-world scenarios, MMRC comprises 5,120 conversations and 28,720 corresponding manually labeled questions, posing a significant challenge to existing MLLMs. Evaluations on 20 MLLMs in MMRC indicate an accuracy drop during open-ended interactions. We identify four common failure patterns: long-term memory degradation, inadequacies in updating factual knowledge, accumulated assumption of error propagation, and reluctance to say no. To mitigate these issues, we propose a simple yet effective NOTE-TAKING strategy, which can record key information from the conversation and remind the model during its responses, enhancing conversational capabilities. Experiments across six MLLMs demonstrate significant performance improvements. 
</p> </div> </dd> <dt> <a name='item180'>[180]</a> <a href ="/abs/2502.11916" title="Abstract" id="2502.11916"> arXiv:2502.11916 </a> [<a href="/pdf/2502.11916" title="Download PDF" id="pdf-2502.11916" aria-labelledby="pdf-2502.11916">pdf</a>, <a href="https://arxiv.org/html/2502.11916v1" title="View HTML" id="html-2502.11916" aria-labelledby="html-2502.11916" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11916" title="Other formats" id="oth-2502.11916" aria-labelledby="oth-2502.11916">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EssayJudge: A Multi-Granular Benchmark for Assessing Automated Essay Scoring Capabilities of Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+J">Jiamin Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yibo Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+F">Fangteng Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Han Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huo,+J">Jiahao Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Huiyu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xuming Hu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> JS and YY are co-first authors. 
XH is the corresponding author </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Automated Essay Scoring (AES) plays a crucial role in educational assessment by providing scalable and consistent evaluations of writing tasks. However, traditional AES systems face three major challenges: (1) reliance on handcrafted features that limit generalizability, (2) difficulty in capturing fine-grained traits like coherence and argumentation, and (3) inability to handle multimodal contexts. In the era of Multimodal Large Language Models (MLLMs), we propose EssayJudge, the first multimodal benchmark to evaluate AES capabilities across lexical-, sentence-, and discourse-level traits. By leveraging MLLMs' strengths in trait-specific scoring and multimodal context understanding, EssayJudge aims to offer precise, context-rich evaluations without manual feature engineering, addressing longstanding AES limitations. Our experiments with 18 representative MLLMs reveal gaps in AES performance compared to human evaluation, particularly in discourse-level traits, highlighting the need for further advancements in MLLM-based AES research. Our dataset and code will be available upon acceptance. 
</p> </div> </dd> <dt> <a name='item181'>[181]</a> <a href ="/abs/2502.11926" title="Abstract" id="2502.11926"> arXiv:2502.11926 </a> [<a href="/pdf/2502.11926" title="Download PDF" id="pdf-2502.11926" aria-labelledby="pdf-2502.11926">pdf</a>, <a href="https://arxiv.org/html/2502.11926v1" title="View HTML" id="html-2502.11926" aria-labelledby="html-2502.11926" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11926" title="Other formats" id="oth-2502.11926" aria-labelledby="oth-2502.11926">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BRIGHTER: BRIdging the Gap in Human-Annotated Textual Emotion Recognition Datasets for 28 Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Muhammad,+S+H">Shamsuddeen Hassan Muhammad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ousidhoum,+N">Nedjma Ousidhoum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdulmumin,+I">Idris Abdulmumin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wahle,+J+P">Jan Philip Wahle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruas,+T">Terry Ruas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Beloucif,+M">Meriem Beloucif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=de+Kock,+C">Christine de Kock</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Surange,+N">Nirmal Surange</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Teodorescu,+D">Daniela Teodorescu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmad,+I+S">Ibrahim Said Ahmad</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Adelani,+D+I">David Ifeoluwa Adelani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aji,+A+F">Alham Fikri Aji</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ali,+F+D+M+A">Felermino D. M. A. Ali</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alimova,+I">Ilseyar Alimova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Araujo,+V">Vladimir Araujo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Babakov,+N">Nikolay Babakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baes,+N">Naomi Baes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bucur,+A">Ana-Maria Bucur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bukula,+A">Andiswa Bukula</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+G">Guanqun Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cardenas,+R+T">Rodrigo Tufino Cardenas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chevi,+R">Rendi Chevi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chukwuneke,+C+I">Chiamaka Ijeoma Chukwuneke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ciobotaru,+A">Alexandra Ciobotaru</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dementieva,+D">Daryna Dementieva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gadanya,+M+S">Murja Sani Gadanya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Geislinger,+R">Robert Geislinger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gipp,+B">Bela Gipp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hourrane,+O">Oumaima Hourrane</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ignat,+O">Oana Ignat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lawan,+F+I">Falalu Ibrahim Lawan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mabuya,+R">Rooweither Mabuya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahendra,+R">Rahmad Mahendra</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Marivate,+V">Vukosi Marivate</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Piper,+A">Andrew Piper</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Panchenko,+A">Alexander Panchenko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ferreira,+C+H+P">Charles Henrique Porto Ferreira</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Protasov,+V">Vitaly Protasov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rutunda,+S">Samuel Rutunda</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shrivastava,+M">Manish Shrivastava</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Udrea,+A+C">Aura Cristina Udrea</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wanzare,+L+D+A">Lilian Diana Awuor Wanzare</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Sophie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wunderlich,+F+V">Florian Valentin Wunderlich</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhafran,+H+M">Hanif Muhammad Zhafran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tianhui Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammad,+S+M">Saif M. Mohammad</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> People worldwide use language in subtle and complex ways to express emotions. 
While emotion recognition -- an umbrella term for several NLP tasks -- significantly impacts different applications in NLP and other fields, most work in the area is focused on high-resource languages. This has led to major disparities in research and proposed solutions, especially for low-resource languages that suffer from the lack of high-quality datasets. In this paper, we present BRIGHTER -- a collection of multilabeled emotion-annotated datasets in 28 different languages. BRIGHTER covers predominantly low-resource languages from Africa, Asia, Eastern Europe, and Latin America, with instances from various domains annotated by fluent speakers. We describe the data collection and annotation processes and the challenges of building these datasets. Then, we report different experimental results for monolingual and crosslingual multi-label emotion identification, as well as intensity-level emotion recognition. We investigate results with and without using LLMs and analyse the large variability in performance across languages and text domains. We show that BRIGHTER datasets are a step towards bridging the gap in text-based emotion recognition and discuss their impact and utility. 
</p> </div> </dd> <dt> <a name='item182'>[182]</a> <a href ="/abs/2502.11932" title="Abstract" id="2502.11932"> arXiv:2502.11932 </a> [<a href="/pdf/2502.11932" title="Download PDF" id="pdf-2502.11932" aria-labelledby="pdf-2502.11932">pdf</a>, <a href="https://arxiv.org/html/2502.11932v1" title="View HTML" id="html-2502.11932" aria-labelledby="html-2502.11932" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11932" title="Other formats" id="oth-2502.11932" aria-labelledby="oth-2502.11932">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On Representational Dissociation of Language and Arithmetic in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kisako,+R">Riku Kisako</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuribayashi,+T">Tatsuki Kuribayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sasano,+R">Ryohei Sasano</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The association between language and (non-linguistic) thinking ability in humans has long been debated, and recently, neuroscientific evidence of brain activity patterns has been considered. Such a scientific context naturally raises an interdisciplinary question -- what about such a language-thought dissociation in large language models (LLMs)? In this paper, as an initial foray, we explore this question by focusing on simple arithmetic skills (e.g., $1+2=$ ?) as a thinking ability and analyzing the geometry of their encoding in LLMs' representation space. 
Our experiments with linear classifiers and cluster separability tests demonstrate that simple arithmetic equations and general language input are encoded in completely separated regions in LLMs' internal representation space across all the layers, which is also supported with more controlled stimuli (e.g., spelled-out equations). These tentatively suggest that arithmetic reasoning is mapped into a distinct region from general language input, which is in line with the neuroscientific observations of human brain activations, while we also point out their somewhat cognitively implausible geometric properties. </p> </div> </dd> <dt> <a name='item183'>[183]</a> <a href ="/abs/2502.11946" title="Abstract" id="2502.11946"> arXiv:2502.11946 </a> [<a href="/pdf/2502.11946" title="Download PDF" id="pdf-2502.11946" aria-labelledby="pdf-2502.11946">pdf</a>, <a href="https://arxiv.org/html/2502.11946v1" title="View HTML" id="html-2502.11946" aria-labelledby="html-2502.11946" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11946" title="Other formats" id="oth-2502.11946" aria-labelledby="oth-2502.11946">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+A">Ailin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+B">Boyong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bruce Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+C">Chao Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+C">Chen Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chengli Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+F">Fei Tian</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+F">Feiyu Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jingbei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mingrui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+P">Peng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+R">Ruihang Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=You,+W">Wang You</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xuerui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yechang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuxiang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Z">Zheng Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zixin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Brian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+C">Changyi Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+H">Hanpeng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+R">Ranchen Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+S">Song Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuelin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bingxin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+B">Buyun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+K">Kang An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+W">Wei Ji</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+X">Xuan Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yuankai Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yuanwei Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mou,+Y">Yun Mou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahmidi,+B">Bahtiyar Ahmidi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+C">Changxin Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chengting Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chenrun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+D">Dapeng Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+D">Deshan Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+D">Dingyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sai,+D">Dula Sai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+E">Enle Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+G">Guanzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+G">Gulin Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Heng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+H">Haonan Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haoyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+J">Jiahao Gong</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jianchang Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiahong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jianjian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhen,+J">Jiangjie Zhen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Jie Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiaoren Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jinguo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jingyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Junzhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaixiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+L">Lei Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+L">Li Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+L">Longlong Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+M">Menglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingxiao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+M">Mingyao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Na Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+N">Nie Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiling Wu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Q">Qinyuan Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+S">Shaoliang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shiliang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+S">Shuli Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sitong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Tiancheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tianyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+W">Wenjin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+W">Wenqing He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+W">Wen Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xin Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+X">Xiaomin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaojia Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y">Yanan Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yanbo Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yangguang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yangzhen Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yanming Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yaqiang Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yilei Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+Y">Yinmin Zhong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, Step-Audio shows a 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at <a href="https://github.com/stepfun-ai/Step-Audio" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item184'>[184]</a> <a href ="/abs/2502.11948" title="Abstract" id="2502.11948"> arXiv:2502.11948 </a> [<a href="/pdf/2502.11948" title="Download PDF" id="pdf-2502.11948" aria-labelledby="pdf-2502.11948">pdf</a>, <a href="https://arxiv.org/html/2502.11948v1" title="View HTML" id="html-2502.11948" aria-labelledby="html-2502.11948" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11948" title="Other formats" id="oth-2502.11948" aria-labelledby="oth-2502.11948">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Your Uncertainty Scores Detect Hallucinated Entity? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yeh,+M">Min-Hsuan Yeh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kamachee,+M">Max Kamachee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+S">Seongheon Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yixuan Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> To mitigate the impact of hallucination nature of LLMs, many studies propose detecting hallucinated generation through uncertainty estimation. However, these approaches predominantly operate at the sentence or paragraph level, failing to pinpoint specific spans or entities responsible for hallucinated content. This lack of granularity is especially problematic for long-form outputs that mix accurate and fabricated information. To address this limitation, we explore entity-level hallucination detection. We propose a new data set, HalluEntity, which annotates hallucination at the entity level. Based on the dataset, we comprehensively evaluate uncertainty-based hallucination detection approaches across 17 modern LLMs. 
Our experimental results show that uncertainty estimation approaches focusing on individual token probabilities tend to over-predict hallucinations, while context-aware methods show better but still suboptimal performance. Through an in-depth qualitative study, we identify relationships between hallucination tendencies and linguistic properties and highlight important directions for future research. </p> </div> </dd> <dt> <a name='item185'>[185]</a> <a href ="/abs/2502.11962" title="Abstract" id="2502.11962"> arXiv:2502.11962 </a> [<a href="/pdf/2502.11962" title="Download PDF" id="pdf-2502.11962" aria-labelledby="pdf-2502.11962">pdf</a>, <a href="https://arxiv.org/html/2502.11962v1" title="View HTML" id="html-2502.11962" aria-labelledby="html-2502.11962" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11962" title="Other formats" id="oth-2502.11962" aria-labelledby="oth-2502.11962">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Navigating the Helpfulness-Truthfulness Trade-Off with Uncertainty-Aware Instruction Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+T">Tianyi Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+J">Jingwei Ni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hooi,+B">Bryan Hooi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiaheng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ash,+E">Elliott Ash</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ng,+S">See-Kiong Ng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sachan,+M">Mrinmaya Sachan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leippold,+M">Markus Leippold</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language 
(cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Instruction Fine-tuning (IFT) can enhance the helpfulness of Large Language Models (LLMs), but it may lower their truthfulness. This trade-off arises because IFT steers LLMs to generate responses with long-tail knowledge that is not well covered during pre-training, leading to more informative but less truthful answers when generalizing to unseen tasks. In this paper, we empirically demonstrate this helpfulness-truthfulness trade-off in IFT and propose $\textbf{UNIT}$, a novel IFT paradigm to address it. UNIT teaches LLMs to recognize their uncertainty and explicitly reflect it at the end of their responses. Experimental results show that UNIT-tuned models maintain their helpfulness while distinguishing between certain and uncertain claims, thereby reducing hallucinations. </p> </div> </dd> <dt> <a name='item186'>[186]</a> <a href ="/abs/2502.11973" title="Abstract" id="2502.11973"> arXiv:2502.11973 </a> [<a href="/pdf/2502.11973" title="Download PDF" id="pdf-2502.11973" aria-labelledby="pdf-2502.11973">pdf</a>, <a href="https://arxiv.org/html/2502.11973v1" title="View HTML" id="html-2502.11973" aria-labelledby="html-2502.11973" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11973" title="Other formats" id="oth-2502.11973" aria-labelledby="oth-2502.11973">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Generating Text from Uniform Meaning Representation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Markle,+E">Emma Markle</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Iranmanesh,+R">Reihaneh Iranmanesh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wein,+S">Shira Wein</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> 
</div> <p class='mathjax'> Uniform Meaning Representation (UMR) is a recently developed graph-based semantic representation, which expands on Abstract Meaning Representation (AMR) in a number of ways, in particular through the inclusion of document-level information and multilingual flexibility. In order to effectively adopt and leverage UMR for downstream tasks, efforts must be placed toward developing a UMR technological ecosystem. Though still limited amounts of UMR annotations have been produced to date, in this work, we investigate the first approaches to producing text from multilingual UMR graphs: (1) a pipeline conversion of UMR to AMR, then using AMR-to-text generation models, (2) fine-tuning large language models with UMR data, and (3) fine-tuning existing AMR-to-text generation models with UMR data. Our best performing model achieves a multilingual BERTscore of 0.825 for English and 0.882 for Chinese when compared to the reference, which is a promising indication of the effectiveness of fine-tuning approaches for UMR-to-text generation with even limited amounts of UMR data. 
</p> </div> </dd> <dt> <a name='item187'>[187]</a> <a href ="/abs/2502.11995" title="Abstract" id="2502.11995"> arXiv:2502.11995 </a> [<a href="/pdf/2502.11995" title="Download PDF" id="pdf-2502.11995" aria-labelledby="pdf-2502.11995">pdf</a>, <a href="https://arxiv.org/html/2502.11995v1" title="View HTML" id="html-2502.11995" aria-labelledby="html-2502.11995" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11995" title="Other formats" id="oth-2502.11995" aria-labelledby="oth-2502.11995">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Presumed Cultural Identity: How Names Shape LLM Responses </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pawar,+S">Siddhesh Pawar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Arora,+A">Arnav Arora</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaffee,+L">Lucie-Aimée Kaffee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Augenstein,+I">Isabelle Augenstein</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 Pages, 13 Figures, 4 Tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Names are deeply tied to human identity. They can serve as markers of individuality, cultural heritage, and personal history. However, using names as a core indicator of identity can lead to over-simplification of complex identities. When interacting with LLMs, user names are an important point of information for personalisation. Names can enter chatbot conversations through direct user input (requested by chatbots), as part of task contexts such as CV reviews, or as built-in memory features that store user information for personalisation. 
We study biases associated with names by measuring cultural presumptions in the responses generated by LLMs when presented with common suggestion-seeking queries, which might involve making assumptions about the user. Our analyses demonstrate strong assumptions about cultural identity associated with names present in LLM generations across multiple cultures. Our work has implications for designing more nuanced personalisation systems that avoid reinforcing stereotypes while maintaining meaningful customisation. </p> </div> </dd> <dt> <a name='item188'>[188]</a> <a href ="/abs/2502.12001" title="Abstract" id="2502.12001"> arXiv:2502.12001 </a> [<a href="/pdf/2502.12001" title="Download PDF" id="pdf-2502.12001" aria-labelledby="pdf-2502.12001">pdf</a>, <a href="https://arxiv.org/html/2502.12001v1" title="View HTML" id="html-2502.12001" aria-labelledby="html-2502.12001" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12001" title="Other formats" id="oth-2502.12001" aria-labelledby="oth-2502.12001">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Merging Language and Domain Specific Models: The Impact on Technical Vocabulary Acquisition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rousset,+T">Thibault Rousset</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kakibuchi,+T">Taisei Kakibuchi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sasaki,+Y">Yusuke Sasaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nomura,+Y">Yoshihide Nomura</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Presented at the 263rd IPSJ-NL Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> This paper investigates the 
integration of technical vocabulary in merged language models. We explore the knowledge transfer mechanisms involved when combining a general-purpose language-specific model with a domain-specific model, focusing on the resulting model's comprehension of technical jargon. Our experiments analyze the impact of this merging process on the target model's proficiency in handling specialized terminology. We present a quantitative evaluation of the performance of the merged model, comparing it with that of the individual constituent models. The findings offer insights into the effectiveness of different model merging methods for enhancing domain-specific knowledge and highlight potential challenges and future directions in leveraging these methods for cross-lingual knowledge transfer in Natural Language Processing. </p> </div> </dd> <dt> <a name='item189'>[189]</a> <a href ="/abs/2502.12007" title="Abstract" id="2502.12007"> arXiv:2502.12007 </a> [<a href="/pdf/2502.12007" title="Download PDF" id="pdf-2502.12007" aria-labelledby="pdf-2502.12007">pdf</a>, <a href="https://arxiv.org/html/2502.12007v1" title="View HTML" id="html-2502.12007" aria-labelledby="html-2502.12007" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12007" title="Other formats" id="oth-2502.12007" aria-labelledby="oth-2502.12007">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Demographic Attributes Prediction from Speech Using WavLM Embeddings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yuchen Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thebaud,+T">Thomas Thebaud</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dehak,+N">Najim Dehak</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, accepted by The Conference on Information Sciences and Systems (CISS) </div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper introduces a general classifier based on WavLM features, to infer demographic characteristics, such as age, gender, native language, education, and country, from speech. Demographic feature prediction plays a crucial role in applications like language learning, accessibility, and digital forensics, enabling more personalized and inclusive technologies. Leveraging pretrained models for embedding extraction, the proposed framework identifies key acoustic and linguistic features associated with demographic attributes, achieving a Mean Absolute Error (MAE) of 4.94 for age prediction and over 99.81% accuracy for gender classification across various datasets. Our system improves upon existing models by up to relative 30% in MAE and up to relative 10% in accuracy and F1 scores across tasks, leveraging a diverse range of datasets and large pretrained models to ensure robustness and generalizability. This study offers new insights into speaker diversity and provides a strong foundation for future research in speech-based demographic profiling. 
</p> </div> </dd> <dt> <a name='item190'>[190]</a> <a href ="/abs/2502.12018" title="Abstract" id="2502.12018"> arXiv:2502.12018 </a> [<a href="/pdf/2502.12018" title="Download PDF" id="pdf-2502.12018" aria-labelledby="pdf-2502.12018">pdf</a>, <a href="https://arxiv.org/html/2502.12018v1" title="View HTML" id="html-2502.12018" aria-labelledby="html-2502.12018" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12018" title="Other formats" id="oth-2502.12018" aria-labelledby="oth-2502.12018">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Atom of Thoughts for Markov LLM Test-Time Scaling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Teng,+F">Fengwei Teng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhaoyang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Q">Quan Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiayi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yuyu Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) achieve superior performance through training-time scaling, and test-time scaling further enhances their capabilities by conducting effective reasoning during inference. However, as the scale of reasoning increases, existing test-time scaling methods suffer from accumulated historical information, which not only wastes computational resources but also interferes with effective reasoning. 
To address this issue, we observe that complex reasoning progress is often achieved by solving a sequence of independent subquestions, each being self-contained and verifiable. These subquestions are essentially atomic questions, relying primarily on their current state rather than accumulated history, similar to the memoryless transitions in a Markov process. Based on this observation, we propose Atom of Thoughts (AoT), where each state transition in the reasoning process consists of decomposing the current question into a dependency-based directed acyclic graph and contracting its subquestions, forming a new atomic question state. This iterative decomposition-contraction process continues until reaching directly solvable atomic questions, naturally realizing Markov transitions between question states. Furthermore, these atomic questions can be seamlessly integrated into existing test-time scaling methods, enabling AoT to serve as a plug-in enhancement for improving reasoning capabilities. Experiments across six benchmarks demonstrate the effectiveness of AoT both as a standalone framework and a plug-in enhancement. Notably, on HotpotQA, when applied to gpt-4o-mini, AoT achieves an 80.6% F1 score, surpassing o3-mini by 3.4% and DeepSeek-R1 by 10.6%. The code will be available at <a href="https://github.com/qixucen/atom" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item191'>[191]</a> <a href ="/abs/2502.12022" title="Abstract" id="2502.12022"> arXiv:2502.12022 </a> [<a href="/pdf/2502.12022" title="Download PDF" id="pdf-2502.12022" aria-labelledby="pdf-2502.12022">pdf</a>, <a href="https://arxiv.org/html/2502.12022v1" title="View HTML" id="html-2502.12022" aria-labelledby="html-2502.12022" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12022" title="Other formats" id="oth-2502.12022" aria-labelledby="oth-2502.12022">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Teaching LLMs According to Their Aptitude: Adaptive Reasoning for Mathematical Problem Solving </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+X">Xin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tianhao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yuchen Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chengwu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zaoyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yufei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yichun Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yasheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shang,+L">Lifeng Shang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qun Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Existing 
approaches to mathematical reasoning with large language models (LLMs) rely on Chain-of-Thought (CoT) for generalizability or Tool-Integrated Reasoning (TIR) for precise computation. While efforts have been made to combine these methods, they primarily rely on post-selection or predefined strategies, leaving an open question: whether LLMs can autonomously adapt their reasoning strategy based on their inherent capabilities. In this work, we propose TATA (Teaching LLMs According to Their Aptitude), an adaptive framework that enables LLMs to personalize their reasoning strategy spontaneously, aligning it with their intrinsic aptitude. TATA incorporates base-LLM-aware data selection during supervised fine-tuning (SFT) to tailor training data to the model's unique abilities. This approach equips LLMs to autonomously determine and apply the appropriate reasoning strategy at test time. We evaluate TATA through extensive experiments on six mathematical reasoning benchmarks, using both general-purpose and math-specialized LLMs. Empirical results demonstrate that TATA effectively combines the complementary strengths of CoT and TIR, achieving superior or comparable performance with improved inference efficiency compared to TIR alone. Further analysis underscores the critical role of aptitude-aware data selection in enabling LLMs to make effective and adaptive reasoning decisions and align reasoning strategies with model capabilities. 
</p> </div> </dd> <dt> <a name='item192'>[192]</a> <a href ="/abs/2502.12050" title="Abstract" id="2502.12050"> arXiv:2502.12050 </a> [<a href="/pdf/2502.12050" title="Download PDF" id="pdf-2502.12050" aria-labelledby="pdf-2502.12050">pdf</a>, <a href="https://arxiv.org/html/2502.12050v1" title="View HTML" id="html-2502.12050" aria-labelledby="html-2502.12050" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12050" title="Other formats" id="oth-2502.12050" aria-labelledby="oth-2502.12050">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SpeechT: Findings of the First Mentorship in Speech Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Moslem,+Y">Yasmin Moslem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mor%C3%A1n,+J+J+C">Juan Julián Cea Morán</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gonzalez-Gomez,+M">Mariano Gonzalez-Gomez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Farouq,+M+H+A">Muhammad Hazim Al Farouq</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdou,+F">Farah Abdou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deb,+S">Satarupa Deb</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD) </div> <p class='mathjax'> This work presents the details and findings of the first mentorship in speech translation (SpeechT), which took place in December 2024 and January 2025. To fulfil the requirements of the mentorship, the participants engaged in key activities, including data preparation, modelling, and advanced research. 
</p> </div> </dd> <dt> <a name='item193'>[193]</a> <a href ="/abs/2502.12051" title="Abstract" id="2502.12051"> arXiv:2502.12051 </a> [<a href="/pdf/2502.12051" title="Download PDF" id="pdf-2502.12051" aria-labelledby="pdf-2502.12051">pdf</a>, <a href="https://arxiv.org/html/2502.12051v1" title="View HTML" id="html-2502.12051" aria-labelledby="html-2502.12051" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12051" title="Other formats" id="oth-2502.12051" aria-labelledby="oth-2502.12051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How to Upscale Neural Networks with Scaling Law? A Survey and Practical Guidelines </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sengupta,+A">Ayan Sengupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goel,+Y">Yash Goel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chakraborty,+T">Tanmoy Chakraborty</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 8 tables, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Neural scaling laws have revolutionized the design and optimization of large-scale AI models by revealing predictable relationships between model size, dataset volume, and computational resources. Early research established power-law relationships in model performance, leading to compute-optimal scaling strategies. However, recent studies highlighted their limitations across architectures, modalities, and deployment contexts. Sparse models, mixture-of-experts, retrieval-augmented learning, and multimodal models often deviate from traditional scaling patterns. 
Moreover, scaling behaviors vary across domains such as vision, reinforcement learning, and fine-tuning, underscoring the need for more nuanced approaches. In this survey, we synthesize insights from over 50 studies, examining the theoretical foundations, empirical findings, and practical implications of scaling laws. We also explore key challenges, including data efficiency, inference scaling, and architecture-specific constraints, advocating for adaptive scaling strategies tailored to real-world applications. We suggest that while scaling laws provide a useful guide, they do not always generalize across all architectures and training strategies. </p> </div> </dd> <dt> <a name='item194'>[194]</a> <a href ="/abs/2502.12052" title="Abstract" id="2502.12052"> arXiv:2502.12052 </a> [<a href="/pdf/2502.12052" title="Download PDF" id="pdf-2502.12052" aria-labelledby="pdf-2502.12052">pdf</a>, <a href="https://arxiv.org/html/2502.12052v1" title="View HTML" id="html-2502.12052" aria-labelledby="html-2502.12052" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12052" title="Other formats" id="oth-2502.12052" aria-labelledby="oth-2502.12052">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Dual-Perspective NLG Meta-Evaluation Framework with Automatic Benchmark and Better Interpretability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xinyu Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+M">Mingqi Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+L">Li Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhenghan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiaojun Wan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In NLG meta-evaluation, evaluation metrics are typically assessed based on their consistency with humans. However, we identify some limitations in traditional NLG meta-evaluation approaches, such as issues in handling human ratings and ambiguous selections of correlation measures, which undermine the effectiveness of meta-evaluation. In this work, we propose a dual-perspective NLG meta-evaluation framework that focuses on different evaluation capabilities, thereby providing better interpretability. In addition, we introduce a method of automatically constructing the corresponding benchmarks without requiring new human annotations. Furthermore, we conduct experiments with 16 representative LLMs as the evaluators based on our proposed framework, comprehensively analyzing their evaluation performance from different perspectives. </p> </div> </dd> <dt> <a name='item195'>[195]</a> <a href ="/abs/2502.12055" title="Abstract" id="2502.12055"> arXiv:2502.12055 </a> [<a href="/pdf/2502.12055" title="Download PDF" id="pdf-2502.12055" aria-labelledby="pdf-2502.12055">pdf</a>, <a href="https://arxiv.org/html/2502.12055v1" title="View HTML" id="html-2502.12055" aria-labelledby="html-2502.12055" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12055" title="Other formats" id="oth-2502.12055" aria-labelledby="oth-2502.12055">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Designing Role Vectors to Improve LLM Inference Behaviour </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Potert%C3%AC,+D">Daniele Potertì</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seveso,+A">Andrea Seveso</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mercorio,+F">Fabio Mercorio</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ARR 2025 February cycle </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The influence of personas on Large Language Models (LLMs) has been widely studied, yet their direct impact on performance remains uncertain. This work explores a novel approach to guiding LLM behaviour through role vectors, an alternative to persona-based prompting. We construct 29 role vectors derived from model activations and evaluate their impact on benchmark performance across multiple domains. Our analysis investigates whether these vectors can effectively steer models toward domain-specific expertise. We measure two key interventions: (i) activation addition, which reinforces role-specific directions, and (ii) directional ablation, which removes them. Results on well-established benchmarks indicate that role vectors do, in fact, influence model behaviour, improving task performance in relevant domains while marginally affecting unrelated tasks. This, in turn, suggests that manipulating internal model representations has a greater impact on outcomes than persona-based prompting. 
</p> </div> </dd> <dt> <a name='item196'>[196]</a> <a href ="/abs/2502.12057" title="Abstract" id="2502.12057"> arXiv:2502.12057 </a> [<a href="/pdf/2502.12057" title="Download PDF" id="pdf-2502.12057" aria-labelledby="pdf-2502.12057">pdf</a>, <a href="https://arxiv.org/html/2502.12057v1" title="View HTML" id="html-2502.12057" aria-labelledby="html-2502.12057" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12057" title="Other formats" id="oth-2502.12057" aria-labelledby="oth-2502.12057">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Culture is Not Trivia: Sociocultural Theory for Cultural NLP </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+N">Naitian Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bamman,+D">David Bamman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bleaman,+I+L">Isaac L. Bleaman</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In submission </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computers and Society (cs.CY) </div> <p class='mathjax'> The field of cultural NLP has recently experienced rapid growth, driven by a pressing need to ensure that language technologies are effective and safe across a pluralistic user base. This work has largely progressed without a shared conception of culture, instead choosing to rely on a wide array of cultural proxies. However, this leads to a number of recurring limitations: coarse national boundaries fail to capture nuanced differences that lay within them, limited coverage restricts datasets to only a subset of usually highly-represented cultures, and a lack of dynamicity results in static cultural benchmarks that do not change as culture evolves. 
In this position paper, we argue that these methodological limitations are symptomatic of a theoretical gap. We draw on a well-developed theory of culture from sociocultural linguistics to fill this gap by 1) demonstrating in a case study how it can clarify methodological constraints and affordances, 2) offering theoretically-motivated paths forward to achieving cultural competence, and 3) arguing that localization is a more useful framing for the goals of much current work in cultural NLP. </p> </div> </dd> <dt> <a name='item197'>[197]</a> <a href ="/abs/2502.12064" title="Abstract" id="2502.12064"> arXiv:2502.12064 </a> [<a href="/pdf/2502.12064" title="Download PDF" id="pdf-2502.12064" aria-labelledby="pdf-2502.12064">pdf</a>, <a href="https://arxiv.org/html/2502.12064v1" title="View HTML" id="html-2502.12064" aria-labelledby="html-2502.12064" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12064" title="Other formats" id="oth-2502.12064" aria-labelledby="oth-2502.12064">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AI-generated Text Detection with a GLTR-based Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+L+Y">Lucía Yan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Segura-Bedmar,+I">Isabel Segura-Bedmar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The rise of LLMs (Large Language Models) has contributed to the improved performance and development of cutting-edge NLP applications. However, these can also pose risks when used maliciously, such as spreading fake news, harmful content, impersonating individuals, or facilitating school plagiarism, among others. 
This is because LLMs can generate high-quality texts, which are challenging to differentiate from those written by humans. GLTR, which stands for Giant Language Model Test Room and was developed jointly by the MIT-IBM Watson AI Lab and HarvardNLP, is a visual tool designed to help detect machine-generated texts based on GPT-2, that highlights the words in text depending on the probability that they were machine-generated. One limitation of GLTR is that the results it returns can sometimes be ambiguous and lead to confusion. This study aims to explore various ways to improve GLTR's effectiveness for detecting AI-generated texts within the context of the IberLef-AuTexTification 2023 shared task, in both English and Spanish languages. Experiment results show that our GLTR-based GPT-2 model overcomes the state-of-the-art models on the English dataset with a macro F1-score of 80.19%, except for the first ranking model (80.91%). However, for the Spanish dataset, we obtained a macro F1-score of 66.20%, which differs by 4.57% compared to the top-performing model. 
</p> </div> </dd> <dt> <a name='item198'>[198]</a> <a href ="/abs/2502.12065" title="Abstract" id="2502.12065"> arXiv:2502.12065 </a> [<a href="/pdf/2502.12065" title="Download PDF" id="pdf-2502.12065" aria-labelledby="pdf-2502.12065">pdf</a>, <a href="https://arxiv.org/html/2502.12065v1" title="View HTML" id="html-2502.12065" aria-labelledby="html-2502.12065" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12065" title="Other formats" id="oth-2502.12065" aria-labelledby="oth-2502.12065">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Formalizing Complex Mathematical Statements with LLMs: A Study on Mathematical Definitions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Valentino,+M">Marco Valentino</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">Andre Freitas</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Formal Languages and Automata Theory (cs.FL) </div> <p class='mathjax'> Thanks to their linguistic capabilities, LLMs offer an opportunity to bridge the gap between informal mathematics and formal languages through autoformalization. However, it is still unclear how well LLMs generalize to sophisticated and naturally occurring mathematical statements. To address this gap, we investigate the task of autoformalizing real-world mathematical definitions -- a critical component of mathematical discourse. Specifically, we introduce two novel resources for autoformalisation, collecting definitions from Wikipedia (Def_Wiki) and arXiv papers (Def_ArXiv). We then systematically evaluate a range of LLMs, analyzing their ability to formalize definitions into Isabelle/HOL. 
Furthermore, we investigate strategies to enhance LLMs' performance including refinement through external feedback from Proof Assistants, and formal definition grounding, where we guide LLMs through relevant contextual elements from formal mathematical libraries. Our findings reveal that definitions present a greater challenge compared to existing benchmarks, such as miniF2F. In particular, we found that LLMs still struggle with self-correction, and aligning with relevant mathematical libraries. At the same time, structured refinement methods and definition grounding strategies yield notable improvements of up to 16% on self-correction capabilities and 43% on the reduction of undefined errors, highlighting promising directions for enhancing LLM-based autoformalization in real-world scenarios. </p> </div> </dd> <dt> <a name='item199'>[199]</a> <a href ="/abs/2502.12067" title="Abstract" id="2502.12067"> arXiv:2502.12067 </a> [<a href="/pdf/2502.12067" title="Download PDF" id="pdf-2502.12067" aria-labelledby="pdf-2502.12067">pdf</a>, <a href="https://arxiv.org/html/2502.12067v1" title="View HTML" id="html-2502.12067" aria-labelledby="html-2502.12067" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12067" title="Other formats" id="oth-2502.12067" aria-labelledby="oth-2502.12067">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TokenSkip: Controllable Chain-of-Thought Compression in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+H">Heming Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yongqi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Leong,+C+T">Chak Tou Leong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenjie Li</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Chain-of-Thought (CoT) has been proven effective in enhancing the reasoning capabilities of large language models (LLMs). Recent advancements, such as OpenAI's o1 and DeepSeek-R1, suggest that scaling up the length of CoT sequences during inference could further boost LLM reasoning performance. However, due to the autoregressive nature of LLM decoding, longer CoT outputs lead to a linear increase in inference latency, adversely affecting user experience, particularly when the CoT exceeds 10,000 tokens. To address this limitation, we analyze the semantic importance of tokens within CoT outputs and reveal that their contributions to reasoning vary. Building on this insight, we propose TokenSkip, a simple yet effective approach that enables LLMs to selectively skip less important tokens, allowing for controllable CoT compression. Extensive experiments across various models and tasks demonstrate the effectiveness of TokenSkip in reducing CoT token usage while preserving strong reasoning performance. Notably, when applied to Qwen2.5-14B-Instruct, TokenSkip reduces reasoning tokens by 40% (from 313 to 181) on GSM8K, with less than a 0.4% performance drop. 
</p> </div> </dd> <dt> <a name='item200'>[200]</a> <a href ="/abs/2502.12073" title="Abstract" id="2502.12073"> arXiv:2502.12073 </a> [<a href="/pdf/2502.12073" title="Download PDF" id="pdf-2502.12073" aria-labelledby="pdf-2502.12073">pdf</a>, <a href="https://arxiv.org/html/2502.12073v1" title="View HTML" id="html-2502.12073" aria-labelledby="html-2502.12073" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12073" title="Other formats" id="oth-2502.12073" aria-labelledby="oth-2502.12073">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can LLMs Simulate Social Media Engagement? A Study on Action-Guided Response Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+Z">Zhongyi Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+H">Hanjia Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+W">Wei Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jiebo Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Social media enables dynamic user engagement with trending topics, and recent research has explored the potential of large language models (LLMs) for response generation. While some studies investigate LLMs as agents for simulating user behavior on social media, their focus remains on practical viability and scalability rather than a deeper understanding of how well LLM aligns with human behavior. This paper analyzes LLMs' ability to simulate social media engagement through action guided response generation, where a model first predicts a user's most likely engagement action -- retweet, quote, or rewrite -- towards a trending post before generating a personalized response conditioned on the predicted action. 
We benchmark GPT-4o-mini, O1-mini, and DeepSeek-R1 in social media engagement simulation regarding a major societal event discussed on X. Our findings reveal that zero-shot LLMs underperform BERT in action prediction, while few-shot prompting initially degrades the prediction accuracy of LLMs with limited examples. However, in response generation, few-shot LLMs achieve stronger semantic alignment with ground truth posts. </p> </div> </dd> <dt> <a name='item201'>[201]</a> <a href ="/abs/2502.12082" title="Abstract" id="2502.12082"> arXiv:2502.12082 </a> [<a href="/pdf/2502.12082" title="Download PDF" id="pdf-2502.12082" aria-labelledby="pdf-2502.12082">pdf</a>, <a href="https://arxiv.org/html/2502.12082v1" title="View HTML" id="html-2502.12082" aria-labelledby="html-2502.12082" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12082" title="Other formats" id="oth-2502.12082" aria-labelledby="oth-2502.12082">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AdaSplash: Adaptive Sparse Flash Attention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gon%C3%A7alves,+N">Nuno Gonçalves</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Treviso,+M">Marcos Treviso</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Martins,+A+F+T">André F. T. Martins</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> The computational cost of softmax-based attention in transformers limits their applicability to long-context tasks. Adaptive sparsity, of which $\alpha$-entmax attention is an example, offers a flexible data-dependent alternative, but existing implementations are inefficient and do not leverage the sparsity to obtain runtime and memory gains. 
In this work, we propose AdaSplash, which combines the efficiency of GPU-optimized algorithms with the sparsity benefits of $\alpha$-entmax. We first introduce a hybrid Halley-bisection algorithm, resulting in a 7-fold reduction in the number of iterations needed to compute the $\alpha$-entmax transformation. Then, we implement custom Triton kernels to efficiently handle adaptive sparsity. Experiments with RoBERTa and ModernBERT for text classification and single-vector retrieval, along with GPT-2 for language modeling, show that our method achieves substantial improvements in runtime and memory efficiency compared to existing $\alpha$-entmax implementations. It approaches -- and in some cases surpasses -- the efficiency of highly optimized softmax implementations like FlashAttention-2, enabling long-context training while maintaining strong task performance. </p> </div> </dd> <dt> <a name='item202'>[202]</a> <a href ="/abs/2502.12084" title="Abstract" id="2502.12084"> arXiv:2502.12084 </a> [<a href="/pdf/2502.12084" title="Download PDF" id="pdf-2502.12084" aria-labelledby="pdf-2502.12084">pdf</a>, <a href="https://arxiv.org/html/2502.12084v1" title="View HTML" id="html-2502.12084" aria-labelledby="html-2502.12084" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12084" title="Other formats" id="oth-2502.12084" aria-labelledby="oth-2502.12084">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VLM$^2$-Bench: A Closer Look at How Well VLMs Implicitly Link Explicit Matching Visual Cues </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jianshu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+D">Dongyu Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pi,+R">Renjie Pi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+P+P">Paul Pu Liang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=R.,+Y">Yi R.</a> (May)<a href="https://arxiv.org/search/cs?searchtype=author&query=Fung">Fung</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project Page: <a href="https://vlm2-bench.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Visually linking matching cues is a crucial ability in daily life, such as identifying the same person in multiple photos based on their cues, even without knowing who they are. Despite the extensive knowledge that vision-language models (VLMs) possess, it remains largely unexplored whether they are capable of performing this fundamental task. To address this, we introduce VLM$^2$-Bench, a benchmark designed to assess whether VLMs can Visually Link Matching cues, with 9 subtasks and over 3,000 test cases. Comprehensive evaluation across eight open-source VLMs and GPT-4o, along with further analysis of various language-side and vision-side prompting methods, leads to a total of eight key findings. We identify critical challenges in models' ability to link visual cues, highlighting a significant performance gap where even GPT-4o lags 34.80% behind humans. Based on these insights, we advocate for (i) enhancing core visual capabilities to improve adaptability and reduce reliance on prior knowledge, (ii) establishing clearer principles for integrating language-based reasoning in vision-centric tasks to prevent unnecessary biases, and (iii) shifting vision-text training paradigms toward fostering models' ability to independently structure and infer relationships among visual cues. 
</p> </div> </dd> <dt> <a name='item203'>[203]</a> <a href ="/abs/2502.12109" title="Abstract" id="2502.12109"> arXiv:2502.12109 </a> [<a href="/pdf/2502.12109" title="Download PDF" id="pdf-2502.12109" aria-labelledby="pdf-2502.12109">pdf</a>, <a href="https://arxiv.org/html/2502.12109v1" title="View HTML" id="html-2502.12109" aria-labelledby="html-2502.12109" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12109" title="Other formats" id="oth-2502.12109" aria-labelledby="oth-2502.12109">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Personality Structured Interview for Large Language Model Simulation in Personality Research </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Pengda Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+H">Huiqi Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hanjie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+T">Tianjun Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Z">Ziang Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oswald,+F+L">Frederick L. Oswald</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 41 Pages, 30 Tables, 5 Figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Although psychometrics researchers have recently explored the use of large language models (LLMs) as proxies for human participants, LLMs often fail to generate heterogeneous data with human-like diversity, which diminishes their value in advancing social science research. 
To address these challenges, we explored the potential of the theory-informed Personality Structured Interview (PSI) as a tool for simulating human responses in personality research. In this approach, the simulation is grounded in nuanced real-human interview transcripts that target the personality construct of interest. We have provided a growing set of 357 structured interview transcripts from a representative sample, each containing an individual's response to 32 open-ended questions carefully designed to gather theory-based personality evidence. Additionally, grounded in psychometric research, we have summarized an evaluation framework to systematically validate LLM-generated psychometric data. Results from three experiments demonstrate that well-designed structured interviews could improve human-like heterogeneity in LLM-simulated personality data and predict personality-related behavioral outcomes (i.e., organizational citizenship behaviors and counterproductive work behavior). We further discuss the role of theory-informed structured interviews in LLM-based simulation and outline a general framework for designing structured interviews to simulate human-like data for psychometric research. 
</p> </div> </dd> <dt> <a name='item204'>[204]</a> <a href ="/abs/2502.12110" title="Abstract" id="2502.12110"> arXiv:2502.12110 </a> [<a href="/pdf/2502.12110" title="Download PDF" id="pdf-2502.12110" aria-labelledby="pdf-2502.12110">pdf</a>, <a href="https://arxiv.org/html/2502.12110v1" title="View HTML" id="html-2502.12110" aria-labelledby="html-2502.12110" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12110" title="Other formats" id="oth-2502.12110" aria-labelledby="oth-2502.12110">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A-MEM: Agentic Memory for LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wujiang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Z">Zujie Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mei,+K">Kai Mei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Hang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+J">Juntao Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yongfeng Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> While large language model (LLM) agents can effectively use external tools for complex real-world tasks, they require memory systems to leverage historical experiences. Current memory systems enable basic storage and retrieval but lack sophisticated memory organization, despite recent attempts to incorporate graph databases. Moreover, these systems' fixed operations and structures limit their adaptability across diverse tasks. 
To address this limitation, this paper proposes a novel agentic memory system for LLM agents that can dynamically organize memories in an agentic way. Following the basic principles of the Zettelkasten method, we designed our memory system to create interconnected knowledge networks through dynamic indexing and linking. When a new memory is added, we generate a comprehensive note containing multiple structured attributes, including contextual descriptions, keywords, and tags. The system then analyzes historical memories to identify relevant connections, establishing links where meaningful similarities exist. Additionally, this process enables memory evolution - as new memories are integrated, they can trigger updates to the contextual representations and attributes of existing historical memories, allowing the memory network to continuously refine its understanding. Our approach combines the structured organization principles of Zettelkasten with the flexibility of agent-driven decision making, allowing for more adaptive and context-aware memory management. Empirical experiments on six foundation models show superior improvement against existing SOTA baselines. The source code is available at <a href="https://github.com/WujiangXu/AgenticMemory" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item205'>[205]</a> <a href ="/abs/2502.12123" title="Abstract" id="2502.12123"> arXiv:2502.12123 </a> [<a href="/pdf/2502.12123" title="Download PDF" id="pdf-2502.12123" aria-labelledby="pdf-2502.12123">pdf</a>, <a href="https://arxiv.org/html/2502.12123v1" title="View HTML" id="html-2502.12123" aria-labelledby="html-2502.12123" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12123" title="Other formats" id="oth-2502.12123" aria-labelledby="oth-2502.12123">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On the Query Complexity of Verifier-Assisted Language Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Botta,+E">Edoardo Botta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuchen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mehta,+A">Aashay Mehta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ash,+J+T">Jordan T. Ash</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C">Cyril Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Risteski,+A">Andrej Risteski</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Recently, a plethora of works have proposed inference-time algorithms (e.g. best-of-n), which incorporate verifiers to assist the generation process. Their quality-efficiency trade-offs have been empirically benchmarked on a variety of constrained generation tasks, but the algorithmic design landscape is still largely poorly understood. 
In this paper, we develop a mathematical framework for reasoning about constrained generation using a pre-trained language model generator oracle and a process verifier--which can decide whether a prefix can be extended to a string which satisfies the constraints of choice. We show that even in very simple settings, access to a verifier can render an intractable problem (information-theoretically or computationally) to a tractable one. In fact, we show even simple algorithms, like tokenwise rejection sampling, can enjoy significant benefits from access to a verifier. Empirically, we show that a natural modification of tokenwise rejection sampling, in which the sampler is allowed to "backtrack" (i.e., erase the final few generated tokens) has robust and substantive benefits over natural baselines (e.g. (blockwise) rejection sampling, nucleus sampling)--both in terms of computational efficiency, accuracy and diversity. </p> </div> </dd> <dt> <a name='item206'>[206]</a> <a href ="/abs/2502.12124" title="Abstract" id="2502.12124"> arXiv:2502.12124 </a> [<a href="/pdf/2502.12124" title="Download PDF" id="pdf-2502.12124" aria-labelledby="pdf-2502.12124">pdf</a>, <a href="https://arxiv.org/html/2502.12124v1" title="View HTML" id="html-2502.12124" aria-labelledby="html-2502.12124" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12124" title="Other formats" id="oth-2502.12124" aria-labelledby="oth-2502.12124">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RA-MTR: A Retrieval Augmented Multi-Task Reader based Approach for Inspirational Quote Extraction from Long Documents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adak,+S">Sayantan Adak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at 
COLING2025-MAIN </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> https://aclanthology.org/2025.coling-main.365/ </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Inspirational quotes from famous individuals are often used to convey thoughts in news articles, essays, and everyday conversations. In this paper, we propose a novel context-based quote extraction system that aims to extract the most relevant quote from a long text. We formulate this quote extraction as an open domain question answering problem first by employing a vector-store based retriever and then applying a multi-task reader. We curate three context-based quote extraction datasets and introduce a novel multi-task framework RA-MTR that improves the state-of-the-art performance, achieving a maximum improvement of 5.08% in BoW F1-score. </p> </div> </dd> <dt> <a name='item207'>[207]</a> <a href ="/abs/2502.12134" title="Abstract" id="2502.12134"> arXiv:2502.12134 </a> [<a href="/pdf/2502.12134" title="Download PDF" id="pdf-2502.12134" aria-labelledby="pdf-2502.12134">pdf</a>, <a href="https://arxiv.org/html/2502.12134v1" title="View HTML" id="html-2502.12134" aria-labelledby="html-2502.12134" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12134" title="Other formats" id="oth-2502.12134" aria-labelledby="oth-2502.12134">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SoftCoT: Soft Chain-of-Thought for Efficient Reasoning with LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yige Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+X">Xu Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Z">Zhiwei Zeng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+C">Chunyan Miao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Chain-of-Thought (CoT) reasoning enables Large Language Models (LLMs) to solve complex reasoning tasks by generating intermediate reasoning steps. However, most existing approaches focus on hard token decoding, which constrains reasoning within the discrete vocabulary space and may not always be optimal. While recent efforts explore continuous-space reasoning, they often suffer from catastrophic forgetting, limiting their applicability to state-of-the-art LLMs that already perform well in zero-shot settings with a proper instruction. To address this challenge, we propose a novel approach for continuous-space reasoning that does not require modifying the underlying LLM. Specifically, we employ a lightweight assistant model to generate instance-specific soft thought tokens speculatively as the initial chain of thoughts, which are then mapped into the LLM's representation space via a projection module. Experimental results on five reasoning benchmarks demonstrate that our method enhances LLM reasoning performance through supervised, parameter-efficient fine-tuning. 
</p> </div> </dd> <dt> <a name='item208'>[208]</a> <a href ="/abs/2502.12137" title="Abstract" id="2502.12137"> arXiv:2502.12137 </a> [<a href="/pdf/2502.12137" title="Download PDF" id="pdf-2502.12137" aria-labelledby="pdf-2502.12137">pdf</a>, <a href="https://arxiv.org/html/2502.12137v1" title="View HTML" id="html-2502.12137" aria-labelledby="html-2502.12137" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12137" title="Other formats" id="oth-2502.12137" aria-labelledby="oth-2502.12137">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> REVERSUM: A Multi-staged Retrieval-Augmented Generation Method to Enhance Wikipedia Tail Biographies through Personal Narratives </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adak,+S">Sayantan Adak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meher,+P+M">Pauras Mangesh Meher</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Das,+P">Paramita Das</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at COLING2025 Industry Track </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> https://aclanthology.org/2025.coling-industry.61/ </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Information Retrieval (cs.IR) </div> <p class='mathjax'> Wikipedia is an invaluable resource for factual information about a wide range of entities. However, the quality of articles on less-known entities often lags behind that of the well-known ones. This study proposes a novel approach to enhancing Wikipedia's B and C category biography articles by leveraging personal narratives such as autobiographies and biographies. 
By utilizing a multi-staged retrieval-augmented generation technique -- REVerSum -- we aim to enrich the informational content of these lesser-known articles. Our study reveals that personal narratives can significantly improve the quality of Wikipedia articles, providing a rich source of reliable information that has been underutilized in previous studies. Based on crowd-based evaluation, REVerSum generated content outperforms the best performing baseline by 17% in terms of integrability to the original Wikipedia article and 28.5% in terms of informativeness. Code and Data are available at: <a href="https://github.com/sayantan11995/wikipedia_enrichment" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item209'>[209]</a> <a href ="/abs/2502.12150" title="Abstract" id="2502.12150"> arXiv:2502.12150 </a> [<a href="/pdf/2502.12150" title="Download PDF" id="pdf-2502.12150" aria-labelledby="pdf-2502.12150">pdf</a>, <a href="https://arxiv.org/html/2502.12150v1" title="View HTML" id="html-2502.12150" aria-labelledby="html-2502.12150" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12150" title="Other formats" id="oth-2502.12150" aria-labelledby="oth-2502.12150">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Idiosyncrasies in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Mingjie Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yida Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhiqiu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kolter,+J+Z">J. 
Zico Kolter</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhuang Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Website at <a href="https://eric-mingjie.github.io/llm-idiosyncrasies/index.html" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this work, we unveil and study idiosyncrasies in Large Language Models (LLMs) -- unique patterns in their outputs that can be used to distinguish the models. To do so, we consider a simple classification task: given a particular text output, the objective is to predict the source LLM that generates the text. We evaluate this synthetic task across various groups of LLMs and find that simply fine-tuning existing text embedding models on LLM-generated texts yields excellent classification accuracy. Notably, we achieve 97.1% accuracy on held-out validation data in the five-way classification problem involving ChatGPT, Claude, Grok, Gemini, and DeepSeek. Our further investigation reveals that these idiosyncrasies are rooted in word-level distributions. These patterns persist even when the texts are rewritten, translated, or summarized by an external LLM, suggesting that they are also encoded in the semantic content. Additionally, we leverage LLM as judges to generate detailed, open-ended descriptions of each model's idiosyncrasies. Finally, we discuss the broader implications of our findings, particularly for training on synthetic data and inferring model similarity. Code is available at <a href="https://github.com/locuslab/llm-idiosyncrasies" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 66 of 66 entries)</h3> <dt> <a name='item210'>[210]</a> <a href ="/abs/2502.10394" title="Abstract" id="2502.10394"> arXiv:2502.10394 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10394" title="Download PDF" id="pdf-2502.10394" aria-labelledby="pdf-2502.10394">pdf</a>, <a href="/format/2502.10394" title="Other formats" id="oth-2502.10394" aria-labelledby="oth-2502.10394">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Coordination-based Approach for Focused Learning in Knowledge-Based Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sharma,+A">Abhishek Sharma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Recent progress in Learning by Reading and Machine Reading systems has significantly increased the capacity of knowledge-based systems to learn new facts. In this work, we discuss the problem of selecting a set of learning requests for these knowledge-based systems which would lead to maximum Q/A performance. To understand the dynamics of this problem, we simulate the properties of a learning strategy, which sends learning requests to an external knowledge source. We show that choosing an optimal set of facts for these learning systems is similar to a coordination game, and use reinforcement learning to solve this problem. Experiments show that such an approach can significantly improve Q/A performance. 
</p> </div> </dd> <dt> <a name='item211'>[211]</a> <a href ="/abs/2502.10411" title="Abstract" id="2502.10411"> arXiv:2502.10411 </a> (cross-list from cs.CY) [<a href="/pdf/2502.10411" title="Download PDF" id="pdf-2502.10411" aria-labelledby="pdf-2502.10411">pdf</a>, <a href="https://arxiv.org/html/2502.10411v1" title="View HTML" id="html-2502.10411" aria-labelledby="html-2502.10411" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10411" title="Other formats" id="oth-2502.10411" aria-labelledby="oth-2502.10411">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TrueReason: An Exemplar Personalised Learning System Integrating Reasoning with Foundational Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bulathwela,+S">Sahan Bulathwela</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Van+Niekerk,+D">Daniel Van Niekerk</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shipton,+J">Jarrod Shipton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Perez-Ortiz,+M">Maria Perez-Ortiz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rosman,+B">Benjamin Rosman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shawe-Taylor,+J">John Shawe-Taylor</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To be published as a book chapter </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Information Retrieval (cs.IR); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Personalised education is one of the domains that can greatly benefit from the most recent advances in Artificial Intelligence (AI) and Large Language Models (LLM). 
However, it is also one of the most challenging applications due to the cognitive complexity of teaching effectively while personalising the learning experience to suit independent learners. We hypothesise that one promising approach to excelling in such demanding use cases is using a \emph{society of minds}. In this chapter, we present TrueReason, an exemplar personalised learning system that integrates a multitude of specialised AI models that can mimic micro skills that are composed together by a LLM to operationalise planning and reasoning. The architecture of the initial prototype is presented while describing two micro skills that have been incorporated in the prototype. The proposed system demonstrates the first step in building sophisticated AI systems that can take up very complex cognitive tasks that are demanded by domains such as education. </p> </div> </dd> <dt> <a name='item212'>[212]</a> <a href ="/abs/2502.10413" title="Abstract" id="2502.10413"> arXiv:2502.10413 </a> (cross-list from cs.CY) [<a href="/pdf/2502.10413" title="Download PDF" id="pdf-2502.10413" aria-labelledby="pdf-2502.10413">pdf</a>, <a href="/format/2502.10413" title="Other formats" id="oth-2502.10413" aria-labelledby="oth-2502.10413">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Machine Learning-Driven Convergence Analysis in Multijurisdictional Compliance Using BERT and K-Means Clustering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sonani,+R">Raj Sonani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Prayas,+L">Lohalekar Prayas</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 5 figures, 4 tables </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Aitoz Journal of AI Research 3 (2024) 126-141 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computers and Society (cs.CY)</span>; Artificial Intelligence (cs.AI); Computational Engineering, Finance, and Science (cs.CE); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> As digital data continues to grow, there has been a shift towards using effective regulatory mechanisms to safeguard personal information. The CCPA of California and the General Data Protection Regulation (GDPR) of the European Union are two of the most important privacy laws. The regulation is intended to safeguard consumer privacy, but it varies greatly in scope, definitions, and methods of enforcement. This paper presents a fresh approach to adaptive compliance, using machine learning and emphasizing natural language processing (NLP) as the primary focus of comparison between the GDPR and CCPA. Using NLP, this study compares various regulations to identify areas where they overlap or diverge. This includes the "right to be forgotten" provision in the GDPR and the "opt-out of sale" provision under CCPA. International companies can learn valuable lessons from this report, as it outlines strategies for better enforcement of laws across different nations. Additionally, the paper discusses the challenges of utilizing NLP in legal literature and proposes methods to enhance the model-ability of machine learning models for studying regulations. The study's objective is to "bridge the gap between legal knowledge and technical expertise" by developing regulatory compliance strategies that are more efficient in operation and more effective in data protection. 
</p> </div> </dd> <dt> <a name='item213'>[213]</a> <a href ="/abs/2502.10420" title="Abstract" id="2502.10420"> arXiv:2502.10420 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10420" title="Download PDF" id="pdf-2502.10420" aria-labelledby="pdf-2502.10420">pdf</a>, <a href="https://arxiv.org/html/2502.10420v1" title="View HTML" id="html-2502.10420" aria-labelledby="html-2502.10420" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10420" title="Other formats" id="oth-2502.10420" aria-labelledby="oth-2502.10420">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Position: Stop Acting Like Language Model Agents Are Normal Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Perrier,+E">Elija Perrier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bennett,+M+T">Michael Timothy Bennett</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Language Model Agents (LMAs) are increasingly treated as capable of autonomously navigating interactions with humans and tools. Their design and deployment tends to presume they are normal agents capable of sustaining coherent goals, adapting across contexts and acting with a measure of intentionality. These assumptions are critical to prospective use cases in industrial, social and governmental settings. But LMAs are not normal agents. They inherit the structural problems of the large language models (LLMs) around which they are built: hallucinations, jailbreaking, misalignment and unpredictability. 
In this Position paper we argue LMAs should not be treated as normal agents, because doing so leads to problems that undermine their utility and trustworthiness. We enumerate pathologies of agency intrinsic to LMAs. Despite scaffolding such as external memory and tools, they remain ontologically stateless, stochastic, semantically sensitive, and linguistically intermediated. These pathologies destabilise the ontological properties of LMAs including identifiability, continuity, persistence and consistency, problematising their claim to agency. In response, we argue LMA ontological properties should be measured before, during and after deployment so that the negative effects of pathologies can be mitigated. </p> </div> </dd> <dt> <a name='item214'>[214]</a> <a href ="/abs/2502.10440" title="Abstract" id="2502.10440"> arXiv:2502.10440 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10440" title="Download PDF" id="pdf-2502.10440" aria-labelledby="pdf-2502.10440">pdf</a>, <a href="https://arxiv.org/html/2502.10440v1" title="View HTML" id="html-2502.10440" aria-labelledby="html-2502.10440" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10440" title="Other formats" id="oth-2502.10440" aria-labelledby="oth-2502.10440">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Copyright Protection for Knowledge Bases of Retrieval-augmented Language Models via Ownership Verification with Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junfeng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yiming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Ruibo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yihan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chenxi Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yanshuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heng Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The first two authors contributed equally to this work. 19 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) are increasingly integrated into real-world applications through retrieval-augmented generation (RAG) mechanisms to supplement their responses with up-to-date and domain-specific knowledge. However, the valuable and often proprietary nature of the knowledge bases used in RAG introduces the risk of unauthorized usage by adversaries. Existing methods that can be generalized as watermarking techniques to protect these knowledge bases typically involve poisoning attacks. However, these methods require to alter the results of verification samples (\eg, generating incorrect outputs), inevitably making them susceptible to anomaly detection and even introduce new security risks. To address these challenges, we propose \name{} for `harmless' copyright protection of knowledge bases. Instead of manipulating LLM's final output, \name{} implants distinct verification behaviors in the space of chain-of-thought (CoT) reasoning, maintaining the correctness of the final answer. 
Our method has three main stages: (1) \textbf{Generating CoTs}: For each verification question, we generate two CoTs, including a target CoT for building watermark behaviors; (2) \textbf{Optimizing Watermark Phrases and Target CoTs}: We optimize them to minimize retrieval errors under the black-box setting of suspicious LLM, ensuring that the watermarked verification queries activate the target CoTs without being activated in non-watermarked ones; (3) \textbf{Ownership Verification}: We exploit a pairwise Wilcoxon test to statistically verify whether a suspicious LLM is augmented with the protected knowledge base by comparing its responses to watermarked and benign verification queries. Our experiments on diverse benchmarks demonstrate that \name{} effectively protects knowledge bases against unauthorized usage while preserving the integrity and performance of the RAG. </p> </div> </dd> <dt> <a name='item215'>[215]</a> <a href ="/abs/2502.10447" title="Abstract" id="2502.10447"> arXiv:2502.10447 </a> (cross-list from eess.AS) [<a href="/pdf/2502.10447" title="Download PDF" id="pdf-2502.10447" aria-labelledby="pdf-2502.10447">pdf</a>, <a href="https://arxiv.org/html/2502.10447v1" title="View HTML" id="html-2502.10447" aria-labelledby="html-2502.10447" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10447" title="Other formats" id="oth-2502.10447" aria-labelledby="oth-2502.10447">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MoHAVE: Mixture of Hierarchical Audio-Visual Experts for Robust Speech Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+S">Sungnyun Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Jang,+K">Kangwook Jang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bae,+S">Sangmin Bae</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&query=Cho,+S">Sungwoo Cho</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yun,+S">Se-Young Yun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preliminary work </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Audio-visual speech recognition (AVSR) has become critical for enhancing speech recognition in noisy environments by integrating both auditory and visual modalities. However, existing AVSR systems struggle to scale up without compromising computational efficiency. In this study, we introduce MoHAVE (Mixture of Hierarchical Audio-Visual Experts), a novel robust AVSR framework designed to address these scalability constraints. By leveraging a Mixture-of-Experts (MoE) architecture, MoHAVE activates modality-specific expert groups, ensuring dynamic adaptation to various audio-visual inputs with minimal computational overhead. Key contributions of MoHAVE include: (1) a sparse MoE framework that efficiently scales AVSR model capacity, (2) a hierarchical gating mechanism that dynamically utilizes the expert groups based on input context, enhancing adaptability and robustness, and (3) remarkable performance across robust AVSR benchmarks, including LRS3 and MuAViC transcription and translation tasks, setting a new standard for scalable speech recognition systems. 
</p> </div> </dd> <dt> <a name='item216'>[216]</a> <a href ="/abs/2502.10450" title="Abstract" id="2502.10450"> arXiv:2502.10450 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10450" title="Download PDF" id="pdf-2502.10450" aria-labelledby="pdf-2502.10450">pdf</a>, <a href="https://arxiv.org/html/2502.10450v1" title="View HTML" id="html-2502.10450" aria-labelledby="html-2502.10450" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10450" title="Other formats" id="oth-2502.10450" aria-labelledby="oth-2502.10450">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Trustworthy AI on Safety, Bias, and Privacy: A Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+X">Xingli Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jianwei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mulchandani,+V">Varun Mulchandani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jung-Eun Kim</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> The capabilities of artificial intelligence systems have been advancing to a great extent, but these systems still struggle with failure modes, vulnerabilities, and biases. In this paper, we study the current state of the field, and present promising insights and perspectives regarding concerns that challenge the trustworthiness of AI models. In particular, this paper investigates the issues regarding three thrusts: safety, privacy, and bias, which hurt models' trustworthiness. For safety, we discuss safety alignment in the context of large language models, preventing them from generating toxic or harmful content. 
For bias, we focus on spurious biases that can mislead a network. Lastly, for privacy, we cover membership inference attacks in deep neural networks. The discussions addressed in this paper reflect our own experiments and observations. </p> </div> </dd> <dt> <a name='item217'>[217]</a> <a href ="/abs/2502.10453" title="Abstract" id="2502.10453"> arXiv:2502.10453 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10453" title="Download PDF" id="pdf-2502.10453" aria-labelledby="pdf-2502.10453">pdf</a>, <a href="https://arxiv.org/html/2502.10453v1" title="View HTML" id="html-2502.10453" aria-labelledby="html-2502.10453" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10453" title="Other formats" id="oth-2502.10453" aria-labelledby="oth-2502.10453">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Linking Cryptoasset Attribution Tags to Knowledge Graph Entities: An LLM-based Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Avice,+R">Régnier Avice</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haslhofer,+B">Bernhard Haslhofer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhidong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jianlong Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at Financial Cryptography and Data Security 2025 Conference (FC2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Databases (cs.DB); Machine Learning (cs.LG) </div> <p class='mathjax'> Attribution tags form the foundation of modern cryptoasset forensics. However, inconsistent or incorrect tags can mislead investigations and even result in false accusations. 
To address this issue, we propose a novel computational method based on Large Language Models (LLMs) to link attribution tags with well-defined knowledge graph concepts. We implemented this method in an end-to-end pipeline and conducted experiments showing that our approach outperforms baseline methods by up to 37.4% in F1-score across three publicly available attribution tag datasets. By integrating concept filtering and blocking procedures, we generate candidate sets containing five knowledge graph entities, achieving a recall of 93% without the need for labeled data. Additionally, we demonstrate that local LLM models can achieve F1-scores of 90%, comparable to remote models which achieve 94%. We also analyze the cost-performance trade-offs of various LLMs and prompt templates, showing that selecting the most cost-effective configuration can reduce costs by 90%, with only a 1% decrease in performance. Our method not only enhances attribution tag quality but also serves as a blueprint for fostering more reliable forensic evidence. </p> </div> </dd> <dt> <a name='item218'>[218]</a> <a href ="/abs/2502.10454" title="Abstract" id="2502.10454"> arXiv:2502.10454 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10454" title="Download PDF" id="pdf-2502.10454" aria-labelledby="pdf-2502.10454">pdf</a>, <a href="https://arxiv.org/html/2502.10454v1" title="View HTML" id="html-2502.10454" aria-labelledby="html-2502.10454" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10454" title="Other formats" id="oth-2502.10454" aria-labelledby="oth-2502.10454">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> One Example Shown, Many Concepts Known! 
Counterexample-Driven Conceptual Reasoning in Mathematical LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuang,+J">Jiayi Kuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Haojing Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhikun Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+X">Xinnian Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+W">Wenlian Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yangning Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+X">Xiaoyu Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+C">Chao Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Ying Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-Tao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+P+S">Philip S. Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Leveraging mathematical Large Language Models (LLMs) for proof generation is a fundamental topic in LLMs research. We argue that the ability of current LLMs to prove statements largely depends on whether they have encountered the relevant proof process during training. This reliance limits their deeper understanding of mathematical theorems and related concepts. 
Inspired by the pedagogical method of "proof by counterexamples" commonly used in human mathematics education, our work aims to enhance LLMs' ability to conduct mathematical reasoning and proof through counterexamples. Specifically, we manually create a high-quality, university-level mathematical benchmark, CounterMATH, which requires LLMs to prove mathematical statements by providing counterexamples, thereby assessing their grasp of mathematical concepts. Additionally, we develop a data engineering framework to automatically obtain training data for further model improvement. Extensive experiments and detailed analyses demonstrate that CounterMATH is challenging, indicating that LLMs, such as OpenAI o1, have insufficient counterexample-driven proof capabilities. Moreover, our exploration into model training reveals that strengthening LLMs' counterexample-driven conceptual reasoning abilities is crucial for improving their overall mathematical capabilities. We believe that our work offers new perspectives on the community of mathematical LLMs. </p> </div> </dd> <dt> <a name='item219'>[219]</a> <a href ="/abs/2502.10505" title="Abstract" id="2502.10505"> arXiv:2502.10505 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10505" title="Download PDF" id="pdf-2502.10505" aria-labelledby="pdf-2502.10505">pdf</a>, <a href="https://arxiv.org/html/2502.10505v1" title="View HTML" id="html-2502.10505" aria-labelledby="html-2502.10505" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10505" title="Other formats" id="oth-2502.10505" aria-labelledby="oth-2502.10505">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Preference learning made easy: Everything should be understood through win rate </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L+H">Lily H. 
Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ranganath,+R">Rajesh Ranganath</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Machine Learning (stat.ML) </div> <p class='mathjax'> Preference learning, or the task of aligning generative models to preference comparison data, has yet to reach the conceptual maturity of classification, density estimation, etc. To close this gap, this work presents a framework to understand preference learning starting from the sampling distribution of pairwise preference data. First, we prove that the only evaluation of a generative model that respects both preferences and prevalences in the data distribution is a form of win rate, justifying win rate as the focal point to understand preference learning. We then analyze preference learning methods as win rate optimization (WRO) or non-WRO. We present novel instances of WRO beyond existing examples (RLHF, NLHF) and identify two key theoretical benefits of all such methods. We prove that common non-WRO methods like DPO and SFT on preferred samples lack these properties and suggest ways to mitigate such theoretical limitations. We also show that WRO underperforms in practice due to optimization difficulties and that optimization success predicts performance better than choices which affect the objective's solution. Our analysis highlights best practices for existing methods and provides recommendations for future research, guided by the principle that one should either align non-WRO methods more closely with WRO or improve the optimization of WRO objectives. 
</p> </div> </dd> <dt> <a name='item220'>[220]</a> <a href ="/abs/2502.10563" title="Abstract" id="2502.10563"> arXiv:2502.10563 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10563" title="Download PDF" id="pdf-2502.10563" aria-labelledby="pdf-2502.10563">pdf</a>, <a href="https://arxiv.org/html/2502.10563v1" title="View HTML" id="html-2502.10563" aria-labelledby="html-2502.10563" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10563" title="Other formats" id="oth-2502.10563" aria-labelledby="oth-2502.10563">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Accelerating Unbiased LLM Evaluation via Synthetic Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zhaoyi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuda Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zanette,+A">Andrea Zanette</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> When developing new large language models (LLMs), a key step is evaluating their final performance, often by computing the win-rate against a reference model based on external feedback. Human feedback is the gold standard, particularly for capturing nuanced qualities like coherence, readability, and alignment with human expectations. However, human evaluations are costly -- even for large tech companies -- and when conducted with active users, they may negatively impact user experience. A promising alternative is synthetic feedback, where evaluations are conducted by other large language models, including reward models. While this eliminates the need for costly human annotations, it introduces biases that may distort the evaluation process. 
In this work, we propose a statistically principled framework that integrates human and synthetic feedback to reduce reliance on human annotations while maintaining unbiased win-rate calculations. Our experiments demonstrate a reduction in human annotations by up to 12.2% with an off-the-shelf synthetic evaluator and up to 24.8% with a finetuned variant. Apart from being generalizable, scalable, and free of hyper-parameter tuning, our method offers predictable annotation savings, which can be estimated based on data-dependent characteristics. </p> </div> </dd> <dt> <a name='item221'>[221]</a> <a href ="/abs/2502.10673" title="Abstract" id="2502.10673"> arXiv:2502.10673 </a> (cross-list from cs.CR) [<a href="/pdf/2502.10673" title="Download PDF" id="pdf-2502.10673" aria-labelledby="pdf-2502.10673">pdf</a>, <a href="https://arxiv.org/html/2502.10673v1" title="View HTML" id="html-2502.10673" aria-labelledby="html-2502.10673" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10673" title="Other formats" id="oth-2502.10673" aria-labelledby="oth-2502.10673">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dataset Protection via Watermarked Canaries in Retrieval-Augmented LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yepeng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xuandong Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+D">Dawn Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bu,+Y">Yuheng Bu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Retrieval-Augmented Generation (RAG) has become an effective method for enhancing large language models (LLMs) with up-to-date knowledge. 
However, it poses a significant risk of IP infringement, as IP datasets may be incorporated into the knowledge database by malicious Retrieval-Augmented LLMs (RA-LLMs) without authorization. To protect the rights of the dataset owner, an effective dataset membership inference algorithm for RA-LLMs is needed. In this work, we introduce a novel approach to safeguard the ownership of text datasets and effectively detect unauthorized use by the RA-LLMs. Our approach preserves the original data completely unchanged while protecting it by inserting specifically designed canary documents into the IP dataset. These canary documents are created with synthetic content and embedded watermarks to ensure uniqueness, stealthiness, and statistical provability. During the detection process, unauthorized usage is identified by querying the canary documents and analyzing the responses of RA-LLMs for statistical evidence of the embedded watermark. Our experimental results demonstrate high query efficiency, detectability, and stealthiness, along with minimal perturbation to the original dataset, all without compromising the performance of the RAG system. 
</p> </div> </dd> <dt> <a name='item222'>[222]</a> <a href ="/abs/2502.10762" title="Abstract" id="2502.10762"> arXiv:2502.10762 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10762" title="Download PDF" id="pdf-2502.10762" aria-labelledby="pdf-2502.10762">pdf</a>, <a href="https://arxiv.org/html/2502.10762v1" title="View HTML" id="html-2502.10762" aria-labelledby="html-2502.10762" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10762" title="Other formats" id="oth-2502.10762" aria-labelledby="oth-2502.10762">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bone Soups: A Seek-and-Soup Model Merging Approach for Controllable Multi-Objective Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+G">Guofu Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+T">Ting Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Y">Yunsheng Shi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> User information needs are often highly diverse and varied. A key challenge in current research is how to achieve controllable multi-objective generation while enabling rapid adaptation to accommodate diverse user demands during test time. Existing solutions, such as Rewarded Soup, focus on merging language models individually tuned on single objectives. While easy to implement and widely used, these approaches face limitations in achieving optimal performance due to their disregard for the impacts of competing objectives on model tuning. 
To address this issue, we propose Bone Soup, a novel model merging approach that first seeks a series of backbone models by considering the impacts of multiple objectives and then makes the soup (i.e., merge the backbone models). Specifically, Bone Soup begins by training multiple backbone models for different objectives using multi-objective reinforcement learning. Each backbone model is guided by a combination of backbone reward signals. To ensure that these models are optimal for the Pareto front, the backbone rewards are crafted by combining standard reward functions into basis vectors, which can then be modified through a rule-based construction method. Bone Soup leverages a symmetric circulant matrix mapping to generate the merging coefficients, which are used to merge the backbone models according to user preferences. Extensive experimental results demonstrate that Bone Soup exhibits strong controllability and Pareto optimality in controllable multi-objective generation, providing a more effective and efficient approach to addressing diverse user needs at test time. 
</p> </div> </dd> <dt> <a name='item223'>[223]</a> <a href ="/abs/2502.10768" title="Abstract" id="2502.10768"> arXiv:2502.10768 </a> (cross-list from cs.IR) [<a href="/pdf/2502.10768" title="Download PDF" id="pdf-2502.10768" aria-labelledby="pdf-2502.10768">pdf</a>, <a href="/format/2502.10768" title="Other formats" id="oth-2502.10768" aria-labelledby="oth-2502.10768">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating improvements on using Large Language Models (LLMs) for property extraction in the Open Research Knowledge Graph (ORKG) </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Schaftner,+S">Sandra Schaftner</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Current research highlights the great potential of Large Language Models (LLMs) for constructing Scholarly Knowledge Graphs (SKGs). One particularly complex step in this process is relation extraction, aimed at identifying suitable properties to describe the content of research. This study builds directly on previous research by three Open Research Knowledge Graph (ORKG) team members who assessed the readiness of LLMs such as GPT-3.5, Llama 2, and Mistral for property extraction in scientific literature. Given the moderate performance observed, the previous work concluded that fine-tuning is needed to improve these models' alignment with scientific tasks and their emulation of human expertise. Expanding on this prior experiment, this study evaluates the impact of advanced prompt engineering techniques and demonstrates that these techniques can significantly enhance the results. 
Additionally, this study extends the property extraction process to include property matching to existing ORKG properties, which are retrieved via the API. The evaluation reveals that results generated through advanced prompt engineering achieve a higher proportion of matches with ORKG properties, further emphasizing the enhanced alignment achieved. Moreover, this lays the groundwork for addressing challenges such as the inconsistency of ORKG properties, an issue highlighted in prior studies. By assigning unique URIs and using standardized terminology, this work increases the consistency of the properties, fulfilling a crucial aspect of Linked Data and FAIR principles - core commitments of ORKG. This, in turn, significantly enhances the applicability of ORKG content for subsequent tasks such as comparisons of research publications. Finally, the study concludes with recommendations for future improvements in the overall property extraction process. </p> </div> </dd> <dt> <a name='item224'>[224]</a> <a href ="/abs/2502.10858" title="Abstract" id="2502.10858"> arXiv:2502.10858 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10858" title="Download PDF" id="pdf-2502.10858" aria-labelledby="pdf-2502.10858">pdf</a>, <a href="https://arxiv.org/html/2502.10858v1" title="View HTML" id="html-2502.10858" aria-labelledby="html-2502.10858" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10858" title="Other formats" id="oth-2502.10858" aria-labelledby="oth-2502.10858">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Is Depth All You Need? 
An Exploration of Iterative Reasoning in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zongqian Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T">Tianyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jiaying Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhan,+M">Mengmeng Zhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xiaofeng Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+L">Lei Feng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Deep iterative chain-of-thought (CoT) reasoning enables LLMs to tackle complex tasks by progressively activating relevant pre-trained knowledge. However, it faces challenges in ensuring continual improvement and determining a stopping criterion. In this paper, we investigate whether the relevant knowledge that contributes directly to solving the given question can be activated from the initial reasoning path, thus circumventing the need for iterative refinement. Our experiments reveal that increasing the diversity of initial reasoning paths can achieve comparable or superior performance, a concept we term \textit{breadth reasoning}. However, existing breadth reasoning approaches, such as self-consistency, offer limited diversity. To address this limitation, we propose a simple yet effective method that enhances reasoning breadth by integrating contextual exploration with reduced sampling randomness. Extensive experiments demonstrate that our approach significantly outperforms deep iterative reasoning. 
Our code is provided in <a href="https://github.com/zongqianwu/breadth" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item225'>[225]</a> <a href ="/abs/2502.10867" title="Abstract" id="2502.10867"> arXiv:2502.10867 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10867" title="Download PDF" id="pdf-2502.10867" aria-labelledby="pdf-2502.10867">pdf</a>, <a href="https://arxiv.org/html/2502.10867v1" title="View HTML" id="html-2502.10867" aria-labelledby="html-2502.10867" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10867" title="Other formats" id="oth-2502.10867" aria-labelledby="oth-2502.10867">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Tutorial on LLM Reasoning: Relevant Methods behind ChatGPT o1 </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jun Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> OpenAI o1 has shown that applying reinforcement learning to integrate reasoning steps directly during inference can significantly improve a model's reasoning capabilities. This result is exciting as the field transitions from the conventional autoregressive method of generating answers to a more deliberate approach that models the slow-thinking process through step-by-step reasoning training. Reinforcement learning plays a key role in both the model's training and decoding processes. In this article, we present a comprehensive formulation of reasoning problems and investigate the use of both model-based and model-free approaches to better support this slow-thinking framework. 
</p> </div> </dd> <dt> <a name='item226'>[226]</a> <a href ="/abs/2502.10928" title="Abstract" id="2502.10928"> arXiv:2502.10928 </a> (cross-list from cs.LG) [<a href="/pdf/2502.10928" title="Download PDF" id="pdf-2502.10928" aria-labelledby="pdf-2502.10928">pdf</a>, <a href="/format/2502.10928" title="Other formats" id="oth-2502.10928" aria-labelledby="oth-2502.10928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Semantic Specialization in MoE Appears with Scale: A Study of DeepSeek R1 Expert Specialization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Olson,+M+L">Matthew Lyle Olson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ratzlaff,+N">Neale Ratzlaff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hinck,+M">Musashi Hinck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+M">Man Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+S">Sungduk Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xue,+C">Chendi Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lal,+V">Vasudev Lal</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> DeepSeek-R1, the largest open-source Mixture-of-Experts (MoE) model, has demonstrated reasoning capabilities comparable to proprietary frontier models. Prior research has explored expert routing in MoE models, but findings suggest that expert selection is often token-dependent rather than semantically driven. Given DeepSeek-R1's enhanced reasoning abilities, we investigate whether its routing mechanism exhibits greater semantic specialization than previous MoE models. 
To explore this, we conduct two key experiments: (1) a word sense disambiguation task, where we examine expert activation patterns for words with differing senses, and (2) a cognitive reasoning analysis, where we assess DeepSeek-R1's structured thought process in an interactive task setting of DiscoveryWorld. We conclude that DeepSeek-R1's routing mechanism is more semantically aware and it engages in structured cognitive processes. </p> </div> </dd> <dt> <a name='item227'>[227]</a> <a href ="/abs/2502.10937" title="Abstract" id="2502.10937"> arXiv:2502.10937 </a> (cross-list from cs.AI) [<a href="/pdf/2502.10937" title="Download PDF" id="pdf-2502.10937" aria-labelledby="pdf-2502.10937">pdf</a>, <a href="/format/2502.10937" title="Other formats" id="oth-2502.10937" aria-labelledby="oth-2502.10937">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SCALE: Towards Collaborative Content Analysis in Social Science with Large Language Model Agents and Human Intervention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+C">Chengshuai Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Z">Zhen Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+C">Chau-Wai Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xinyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tianlong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Huan Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Content analysis breaks down complex and unstructured texts into theory-informed numerical categories. 
Particularly, in social science, this process usually relies on multiple rounds of manual annotation, domain expert discussion, and rule-based refinement. In this paper, we introduce SCALE, a novel multi-agent framework that effectively $\underline{\textbf{S}}$imulates $\underline{\textbf{C}}$ontent $\underline{\textbf{A}}$nalysis via $\underline{\textbf{L}}$arge language model (LLM) ag$\underline{\textbf{E}}$nts. SCALE imitates key phases of content analysis, including text coding, collaborative discussion, and dynamic codebook evolution, capturing the reflective depth and adaptive discussions of human researchers. Furthermore, by integrating diverse modes of human intervention, SCALE is augmented with expert input to further enhance its performance. Extensive evaluations on real-world datasets demonstrate that SCALE achieves human-approximated performance across various complex content analysis tasks, offering an innovative potential for future social science research. </p> </div> </dd> <dt> <a name='item228'>[228]</a> <a href ="/abs/2502.10976" title="Abstract" id="2502.10976"> arXiv:2502.10976 </a> (cross-list from cs.IR) [<a href="/pdf/2502.10976" title="Download PDF" id="pdf-2502.10976" aria-labelledby="pdf-2502.10976">pdf</a>, <a href="https://arxiv.org/html/2502.10976v1" title="View HTML" id="html-2502.10976" aria-labelledby="html-2502.10976" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10976" title="Other formats" id="oth-2502.10976" aria-labelledby="oth-2502.10976">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> QuOTE: Question-Oriented Text Embeddings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Neeser,+A">Andrew Neeser</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Latimer,+K">Kaylen Latimer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khatri,+A">Aadyant Khatri</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Latimer,+C">Chris Latimer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramakrishnan,+N">Naren Ramakrishnan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> We present QuOTE (Question-Oriented Text Embeddings), a novel enhancement to retrieval-augmented generation (RAG) systems, aimed at improving document representation for accurate and nuanced retrieval. Unlike traditional RAG pipelines, which rely on embedding raw text chunks, QuOTE augments chunks with hypothetical questions that the chunk can potentially answer, enriching the representation space. This better aligns document embeddings with user query semantics, and helps address issues such as ambiguity and context-dependent relevance. Through extensive experiments across diverse benchmarks, we demonstrate that QuOTE significantly enhances retrieval accuracy, including in multi-hop question-answering tasks. Our findings highlight the versatility of question generation as a fundamental indexing strategy, opening new avenues for integrating question generation into retrieval-based AI pipelines. 
</p> </div> </dd> <dt> <a name='item229'>[229]</a> <a href ="/abs/2502.10999" title="Abstract" id="2502.10999"> arXiv:2502.10999 </a> (cross-list from cs.CV) [<a href="/pdf/2502.10999" title="Download PDF" id="pdf-2502.10999" aria-labelledby="pdf-2502.10999">pdf</a>, <a href="https://arxiv.org/html/2502.10999v1" title="View HTML" id="html-2502.10999" aria-labelledby="html-2502.10999" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10999" title="Other formats" id="oth-2502.10999" aria-labelledby="oth-2502.10999">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ControlText: Unlocking Controllable Fonts in Multilingual Text Rendering without Font Annotations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+B">Bowen Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+X">Xinyi Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Z">Zhuoqun Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+A">Alyson Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yaojie Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+W">Wenyu Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ungar,+L">Lyle Ungar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taylor,+C+J">Camillo J. 
Taylor</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This is preliminary work and code will be released at <a href="http://github.com/bowen-upenn/ControlText" rel="external noopener nofollow" class="link-external link-http">this http URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Multimedia (cs.MM) </div> <p class='mathjax'> This work demonstrates that diffusion models can achieve font-controllable multilingual text rendering using just raw images without font label annotations. Visual text rendering remains a significant challenge. While recent methods condition diffusion on glyphs, it is impossible to retrieve exact font annotations from large-scale, real-world datasets, which prevents user-specified font control. To address this, we propose a data-driven solution that integrates the conditional diffusion model with a text segmentation model, utilizing segmentation masks to capture and represent fonts in pixel space in a self-supervised manner, thereby eliminating the need for any ground-truth labels and enabling users to customize text rendering with any multilingual font of their choice. The experiment provides a proof of concept of our algorithm in zero-shot text and font editing across diverse fonts and languages, providing valuable insights for the community and industry toward achieving generalized visual text rendering. 
</p> </div> </dd> <dt> <a name='item230'>[230]</a> <a href ="/abs/2502.11021" title="Abstract" id="2502.11021"> arXiv:2502.11021 </a> (cross-list from cs.NI) [<a href="/pdf/2502.11021" title="Download PDF" id="pdf-2502.11021" aria-labelledby="pdf-2502.11021">pdf</a>, <a href="https://arxiv.org/html/2502.11021v1" title="View HTML" id="html-2502.11021" aria-labelledby="html-2502.11021" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11021" title="Other formats" id="oth-2502.11021" aria-labelledby="oth-2502.11021">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Uncertainty Estimation for Efficient LLM Routing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tuo Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mehradfar,+A">Asal Mehradfar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dimitriadis,+D">Dimitrios Dimitriadis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Avestimehr,+S">Salman Avestimehr</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Networking and Internet Architecture (cs.NI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Deploying large language models (LLMs) in edge-cloud environments requires an efficient routing strategy to balance cost and response quality. Traditional approaches prioritize either human-preference data or accuracy metrics from benchmark datasets as routing criteria, but these methods suffer from rigidity and subjectivity. Moreover, existing routing frameworks primarily focus on accuracy and cost, neglecting response quality from a human preference perspective. In this work, we propose the Confidence-Driven LLM Router, a novel framework that leverages uncertainty estimation to optimize routing decisions. 
To comprehensively assess routing performance, we evaluate both system cost efficiency and response quality. In particular, we introduce the novel use of LLM-as-a-Judge to simulate human rating preferences, providing the first systematic assessment of response quality across different routing strategies. Extensive experiments on MT-Bench, GSM8K, and MMLU demonstrate that our approach outperforms state-of-the-art routing methods, achieving superior response quality while maintaining cost efficiency. </p> </div> </dd> <dt> <a name='item231'>[231]</a> <a href ="/abs/2502.11026" title="Abstract" id="2502.11026"> arXiv:2502.11026 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11026" title="Download PDF" id="pdf-2502.11026" aria-labelledby="pdf-2502.11026">pdf</a>, <a href="https://arxiv.org/html/2502.11026v1" title="View HTML" id="html-2502.11026" aria-labelledby="html-2502.11026" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11026" title="Other formats" id="oth-2502.11026" aria-labelledby="oth-2502.11026">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Simplify RLHF as Reward-Weighted SFT: A Variational Method </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yuhao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+P">Pengyu Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhihong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Y">Yuejiao Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiang Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial 
Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Reinforcement Learning from Human Feedback (RLHF) is crucial for aligning Large Language Models (LLMs) with human values. However, RLHF has been continuously challenged by its high complexity in implementation and computation consumption. Even with recent simplifications, such as Direct Preference Optimization (DPO) and Advantage Leftover Lunch (A-LoL), the problems of over-fitting and training instability remain hindering the alignment process from the expected optimal performance. To address the existing challenges, we propose a novel simplification of RLHF from the perspective of variational inference, called $\textbf{V}$ariational $\textbf{A}$lignment with $\textbf{R}$e-weighting ($\textbf{VAR}$). More specifically, by directly minimizing the distribution gap between the learning LLM policy and the optimal solution of RLHF, we transform the alignment objective into a reward-driven re-weighted supervised fine-tuning (SFT) form, which only requires minor adjustment on the SFT loss to obtain noticeable improvement on training stability and effectiveness. On comprehensive alignment and generation benchmarks, our VAR method has numerically achieved competitive performance in LLM alignment helpfulness and harmlessness. 
</p> </div> </dd> <dt> <a name='item232'>[232]</a> <a href ="/abs/2502.11096" title="Abstract" id="2502.11096"> arXiv:2502.11096 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11096" title="Download PDF" id="pdf-2502.11096" aria-labelledby="pdf-2502.11096">pdf</a>, <a href="https://arxiv.org/html/2502.11096v1" title="View HTML" id="html-2502.11096" aria-labelledby="html-2502.11096" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11096" title="Other formats" id="oth-2502.11096" aria-labelledby="oth-2502.11096">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mixture of Tunable Experts - Behavior Modification of DeepSeek-R1 at Inference Time </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dahlke,+R">Robert Dahlke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klagges,+H">Henrik Klagges</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zecha,+D">Dan Zecha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Merkel,+B">Benjamin Merkel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rohr,+S">Sven Rohr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klemm,+F">Fabian Klemm</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We present the Mixture-of-Tunable-Experts (MoTE), a method that extends the Mixture-of-Experts architecture of Large Language Models (LLMs). Without additional training, MoTE enables meaningful and focused behavior changes in LLMs on-the-fly during inference time. 
<br>By analyzing the digital LLM brain of DeepSeek-R1 using a technique we dub 'functional Token Resonance Imaging' (fTRI) - inspired by fMRI and using prompts designed to elicit specific behavior (e.g., 'What happened {time}{place}?') - we empirically identify distinctive experts associated with behaviors like refusal responses. <br>Using MoTE we are able to intervene and control such specific behavior. We switched off the top 10 most refusal-relevant experts (0.07% of R1's 14,848 routed experts), achieving a 52% refusal reduction on sensitive reference prompts without performance degradation on MT-Bench. Random expert deactivation resulted in smaller behavioral shifts with increased noise, whereas forced expert activation led to significantly higher refusal rates. <br>Our approach shares similarities with sparse autoencoders (SAEs) in terms of explainability and steerability. Unlike SAEs, MoTE does not require large training efforts, as within MoEs with a vast number of experts, specialization already emerged naturally during pretraining. <br>Our findings suggest that significant functional mechanisms in Mixture-of-Experts architectures can at least partially be localized in a small number of specific experts, rather than being distributed throughout the model's weights. Expert subgroups can be tuned to trigger significant behavior variations, providing insights into the inner workings of LLMs. 
</p> </div> </dd> <dt> <a name='item233'>[233]</a> <a href ="/abs/2502.11140" title="Abstract" id="2502.11140"> arXiv:2502.11140 </a> (cross-list from cs.SE) [<a href="/pdf/2502.11140" title="Download PDF" id="pdf-2502.11140" aria-labelledby="pdf-2502.11140">pdf</a>, <a href="https://arxiv.org/html/2502.11140v1" title="View HTML" id="html-2502.11140" aria-labelledby="html-2502.11140" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11140" title="Other formats" id="oth-2502.11140" aria-labelledby="oth-2502.11140">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VisPath: Automated Visualization Code Synthesis via Multi-Path Reasoning and Feedback-Driven Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+W">Wonduk Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seungyong Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+D">Daye Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Z">Zonghao Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seunghyun Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 3 figures, 4 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Unprecedented breakthroughs in Large Language Models (LLMs) have amplified their penetration into the application of automated visualization code generation. 
Few-shot prompting and query expansion techniques have notably enhanced data visualization performance; however, they still fail to overcome the ambiguity and complexity of natural language queries - imposing an inherent burden for manual human intervention. To mitigate such limitations, we propose a holistic framework VisPath : A Multi-Path Reasoning and Feedback-Driven Optimization Framework for Visualization Code Generation, which systematically enhances code quality through structured reasoning and refinement. VisPath is a multi-stage framework, specially designed to handle underspecified queries. To generate a robust final visualization code, it first utilizes the initial query to generate diverse reformulated queries via Chain-of-Thought (CoT) prompting, each representing a distinct reasoning path. Refined queries are used to produce candidate visualization scripts, consequently executed to generate multiple images. Comprehensively assessing correctness and quality of outputs, VisPath generates feedback for each image, which are then fed to an aggregation module to generate the optimal result. Extensive experiments on benchmarks including MatPlotBench and the Qwen-Agent Code Interpreter Benchmark show that VisPath significantly outperforms state-of-the-art (SOTA) methods, with improvements of up to 17% on average, offering a more reliable solution for AI-driven visualization code generation. 
</p> </div> </dd> <dt> <a name='item234'>[234]</a> <a href ="/abs/2502.11142" title="Abstract" id="2502.11142"> arXiv:2502.11142 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11142" title="Download PDF" id="pdf-2502.11142" aria-labelledby="pdf-2502.11142">pdf</a>, <a href="https://arxiv.org/html/2502.11142v1" title="View HTML" id="html-2502.11142" aria-labelledby="html-2502.11142" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11142" title="Other formats" id="oth-2502.11142" aria-labelledby="oth-2502.11142">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NavRAG: Generating User Demand Instructions for Embodied Navigation through Retrieval-Augmented LLM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zihan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yaohui Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+G+H">Gim Hee Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Y">Yachun Fan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Vision-and-Language Navigation (VLN) is an essential skill for embodied agents, allowing them to navigate in 3D environments following natural language instructions. High-performance navigation models require a large amount of training data, but the high cost of manually annotating data has seriously hindered this field. Therefore, some previous methods translate trajectory videos into step-by-step instructions for expanding data, but such instructions do not match well with users' communication styles that briefly describe destinations or state specific needs. 
Moreover, local navigation trajectories overlook global context and high-level task planning. To address these issues, we propose NavRAG, a retrieval-augmented generation (RAG) framework that generates user demand instructions for VLN. NavRAG leverages LLM to build a hierarchical scene description tree for 3D scene understanding from global layout to local details, then simulates various user roles with specific demands to retrieve from the scene tree, generating diverse instructions with LLM. We annotate over 2 million navigation instructions across 861 scenes and evaluate the data quality and navigation performance of trained models. </p> </div> </dd> <dt> <a name='item235'>[235]</a> <a href ="/abs/2502.11155" title="Abstract" id="2502.11155"> arXiv:2502.11155 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11155" title="Download PDF" id="pdf-2502.11155" aria-labelledby="pdf-2502.11155">pdf</a>, <a href="https://arxiv.org/html/2502.11155v1" title="View HTML" id="html-2502.11155" aria-labelledby="html-2502.11155" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11155" title="Other formats" id="oth-2502.11155" aria-labelledby="oth-2502.11155">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Uncertainty-Aware Search and Value Models: Mitigating Search Scaling Flaws in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F">Fei Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yingru Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Value model-guided search is effective in steering the generation but suffers from scaling flaws: Its superiority diminishes with 
larger sample sizes, underperforming non-search baselines. This limitation arises from reliability degradation in value models in unseen reasoning paths. To address this, we propose an uncertainty-aware search framework that includes two key components: (1) uncertainty-aware value models that incorporate uncertainty into predictions, and (2) an uncertainty-aware selection process using the proposed efficient Group Thompson Sampling algorithm. Experiments on GSM8K show that our method mitigates search scaling flaws, achieving 90.5% coverage at 16 samples compared to 85.8% for conventional value-guided search. This work establishes the first systematic integration of uncertainty quantification in LLM search paradigms. </p> </div> </dd> <dt> <a name='item236'>[236]</a> <a href ="/abs/2502.11163" title="Abstract" id="2502.11163"> arXiv:2502.11163 </a> (cross-list from cs.CV) [<a href="/pdf/2502.11163" title="Download PDF" id="pdf-2502.11163" aria-labelledby="pdf-2502.11163">pdf</a>, <a href="https://arxiv.org/html/2502.11163v1" title="View HTML" id="html-2502.11163" aria-labelledby="html-2502.11163" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11163" title="Other formats" id="oth-2502.11163" aria-labelledby="oth-2502.11163">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VLMs as GeoGuessr Masters: Exceptional Performance, Hidden Biases, and Privacy Risks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jingyuan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jen-tse Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Ziyi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jieyu Zhao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Visual-Language Models (VLMs) have shown remarkable performance across various tasks, particularly in recognizing geographic information from images. However, significant challenges remain, including biases and privacy concerns. To systematically address these issues in the context of geographic information recognition, we introduce a benchmark dataset consisting of 1,200 images paired with detailed geographic metadata. Evaluating four VLMs, we find that while these models demonstrate the ability to recognize geographic information from images, achieving up to $53.8\%$ accuracy in city prediction, they exhibit significant regional biases. Specifically, performance is substantially higher for economically developed and densely populated regions compared to less developed ($-12.5\%$) and sparsely populated ($-17.0\%$) areas. Moreover, the models exhibit regional biases, frequently overpredicting certain locations; for instance, they consistently predict Sydney for images taken in Australia. The strong performance of VLMs also raises privacy concerns, particularly for users who share images online without the intent of being identified. Our code and dataset are publicly available at <a href="https://github.com/uscnlp-lime/FairLocator" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item237'>[237]</a> <a href ="/abs/2502.11167" title="Abstract" id="2502.11167"> arXiv:2502.11167 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11167" title="Download PDF" id="pdf-2502.11167" aria-labelledby="pdf-2502.11167">pdf</a>, <a href="/format/2502.11167" title="Other formats" id="oth-2502.11167" aria-labelledby="oth-2502.11167">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SURGE: On the Potential of Large Language Models as General-Purpose Surrogate Code Executors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+B">Bohan Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Siqiao Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Z">Zichen Liang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large language models (LLMs) have demonstrated remarkable capabilities in code-related tasks, such as code understanding and code generation. However, an equally important yet underexplored question is whether LLMs can serve as general-purpose surrogate code executors, to predict the output and behavior of a program without actually running it. To systematically investigate this capability, we introduce SURGE, a comprehensive benchmark covering eight key aspects: multi-language programming tasks, competition-level programming problems, repository-level code analysis, high-cost scientific computing, time-complexity-intensive algorithms, buggy code analysis, programs dependent on specific compilers or execution environments, and formal mathematical proof verification. 
We evaluate multiple open-source and proprietary LLMs on SURGE and conduct a scaling study to analyze the impact of model size and training data scale on surrogate execution accuracy. Additionally, we categorize model prediction errors and explore potential areas for improvement. Our findings indicate that while LLMs can predict code execution results in certain cases, they exhibit limitations in general-purpose surrogate execution. This study provides empirical insights into the feasibility of using LLMs as surrogate code executors. Code and dataset are released at <a href="https://github.com/Imbernoulli/SURGE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item238'>[238]</a> <a href ="/abs/2502.11191" title="Abstract" id="2502.11191"> arXiv:2502.11191 </a> (cross-list from cs.CR) [<a href="/pdf/2502.11191" title="Download PDF" id="pdf-2502.11191" aria-labelledby="pdf-2502.11191">pdf</a>, <a href="https://arxiv.org/html/2502.11191v1" title="View HTML" id="html-2502.11191" aria-labelledby="html-2502.11191" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11191" title="Other formats" id="oth-2502.11191" aria-labelledby="oth-2502.11191">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Primus: A Pioneering Collection of Open-Source Datasets for Cybersecurity LLM Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yao-Ching Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chiang,+T">Tsun-Han Chiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsai,+C">Cheng-Wei Tsai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chien-Ming Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsao,+W">Wen-Kwang Tsao</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) have shown remarkable advancements in specialized fields such as finance, law, and medicine. However, in cybersecurity, we have noticed a lack of open-source datasets, with a particular lack of high-quality cybersecurity pretraining corpora, even though much research indicates that LLMs acquire their knowledge during pretraining. To address this, we present a comprehensive suite of datasets covering all major training stages, including pretraining, instruction fine-tuning, and reasoning distillation with cybersecurity-specific self-reflection data. Extensive ablation studies demonstrate their effectiveness on public cybersecurity benchmarks. In particular, continual pre-training on our dataset yields a 15.88% improvement in the aggregate score, while reasoning distillation leads to a 10% gain in security certification (CISSP). We will release all datasets and trained cybersecurity LLMs under the ODC-BY and MIT licenses to encourage further research in the community. For access to all datasets and model weights, please refer to <a href="https://huggingface.co/collections/trendmicro-ailab/primus-67b1fd27052b802b4af9d243" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item239'>[239]</a> <a href ="/abs/2502.11196" title="Abstract" id="2502.11196"> arXiv:2502.11196 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11196" title="Download PDF" id="pdf-2502.11196" aria-labelledby="pdf-2502.11196">pdf</a>, <a href="https://arxiv.org/html/2502.11196v1" title="View HTML" id="html-2502.11196" aria-labelledby="html-2502.11196" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11196" title="Other formats" id="oth-2502.11196" aria-labelledby="oth-2502.11196">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Do LLMs Acquire New Knowledge? A Knowledge Circuits Perspective on Continual Pre-Training </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+Y">Yixin Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Y">Yunzhi Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+N">Ningyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+H">Hui Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jiacheng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+S">Shumin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhenguo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Huajun Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Despite exceptional capabilities in knowledge-intensive tasks, Large Language Models (LLMs) face a critical gap in 
understanding how they internalize new knowledge, particularly how to structurally embed acquired knowledge in their neural computations. We address this issue through the lens of knowledge circuit evolution, identifying computational subgraphs that facilitate knowledge storage and processing. Our systematic analysis of circuit evolution throughout continual pre-training reveals several key findings: (1) the acquisition of new knowledge is influenced by its relevance to pre-existing knowledge; (2) the evolution of knowledge circuits exhibits a distinct phase shift from formation to optimization; (3) the evolution of knowledge circuits follows a deep-to-shallow pattern. These insights not only advance our theoretical understanding of the mechanisms of new knowledge acquisition in LLMs, but also provide potential implications for improving continual pre-training strategies to enhance model performance. Code and data will be available at <a href="https://github.com/zjunlp/DynamicKnowledgeCircuits" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item240'>[240]</a> <a href ="/abs/2502.11221" title="Abstract" id="2502.11221"> arXiv:2502.11221 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11221" title="Download PDF" id="pdf-2502.11221" aria-labelledby="pdf-2502.11221">pdf</a>, <a href="https://arxiv.org/html/2502.11221v1" title="View HTML" id="html-2502.11221" aria-labelledby="html-2502.11221" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11221" title="Other formats" id="oth-2502.11221" aria-labelledby="oth-2502.11221">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PlanGenLLMs: A Modern Survey of LLM Planning Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+H">Hui Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zihao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shenghua He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+T">Tian Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+S">Shijia Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fei Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint. Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> LLMs have immense potential for generating plans, transforming an initial world state into a desired goal state. A large body of research has explored the use of LLMs for various planning tasks, from web navigation to travel planning and database querying. However, many of these systems are tailored to specific problems, making it challenging to compare them or determine the best approach for new tasks. 
There is also a lack of clear and consistent evaluation criteria. Our survey aims to offer a comprehensive overview of current LLM planners to fill this gap. It builds on foundational work by Kartam and Wilkins (1990) and examines six key performance criteria: completeness, executability, optimality, representation, generalization, and efficiency. For each, we provide a thorough analysis of representative works and highlight their strengths and weaknesses. Our paper also identifies crucial future directions, making it a valuable resource for both practitioners and newcomers interested in leveraging LLM planning to support agentic workflows. </p> </div> </dd> <dt> <a name='item241'>[241]</a> <a href ="/abs/2502.11246" title="Abstract" id="2502.11246"> arXiv:2502.11246 </a> (cross-list from cs.IR) [<a href="/pdf/2502.11246" title="Download PDF" id="pdf-2502.11246" aria-labelledby="pdf-2502.11246">pdf</a>, <a href="https://arxiv.org/html/2502.11246v1" title="View HTML" id="html-2502.11246" aria-labelledby="html-2502.11246" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11246" title="Other formats" id="oth-2502.11246" aria-labelledby="oth-2502.11246">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MemeSense: An Adaptive In-Context Framework for Social Commonsense Driven Meme Moderation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adak,+S">Sayantan Adak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Banerjee,+S">Somnath Banerjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mandal,+R">Rajarshi Mandal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Halder,+A">Avik Halder</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Layek,+S">Sayan Layek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hazra,+R">Rima Hazra</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code and data available at: <a href="https://github.com/sayantan11995/MemeSense" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL); Computers and Society (cs.CY) </div> <p class='mathjax'> Memes present unique moderation challenges due to their subtle, multimodal interplay of images, text, and social context. Standard systems relying predominantly on explicit textual cues often overlook harmful content camouflaged by irony, symbolism, or cultural references. To address this gap, we introduce MemeSense, an adaptive in-context learning framework that fuses social commonsense reasoning with visually and semantically related reference examples. By encoding crucial task information into a learnable cognitive shift vector, MemeSense effectively balances lexical, visual, and ethical considerations, enabling precise yet context-aware meme intervention. Extensive evaluations on a curated set of implicitly harmful memes demonstrate that MemeSense substantially outperforms strong baselines, paving the way for safer online communities. 
Code and data available at: <a href="https://github.com/sayantan11995/MemeSense" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item242'>[242]</a> <a href ="/abs/2502.11256" title="Abstract" id="2502.11256"> arXiv:2502.11256 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11256" title="Download PDF" id="pdf-2502.11256" aria-labelledby="pdf-2502.11256">pdf</a>, <a href="https://arxiv.org/html/2502.11256v1" title="View HTML" id="html-2502.11256" aria-labelledby="html-2502.11256" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11256" title="Other formats" id="oth-2502.11256" aria-labelledby="oth-2502.11256">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unveiling Environmental Impacts of Large Language Model Serving: A Functional Unit View </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yanran Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+I">Inez Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+Y">Yi Ding</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages, 38 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Hardware Architecture (cs.AR); Computation and Language (cs.CL) </div> <p class='mathjax'> Large language models (LLMs) offer powerful capabilities but come with significant environmental costs, particularly in carbon emissions. Existing studies benchmark these emissions but lack a standardized basis for comparison across models. To address this, we introduce the concept of a functional unit (FU) and develop FUEL, the first FU-based framework for evaluating LLM serving's environmental impact. 
Through case studies on model size, quantization, and hardware, we uncover key trade-offs in sustainability. Our findings highlight the potential for reducing carbon emissions by optimizing model selection, deployment strategies, and hardware choices, paving the way for more sustainable AI infrastructure. </p> </div> </dd> <dt> <a name='item243'>[243]</a> <a href ="/abs/2502.11267" title="Abstract" id="2502.11267"> arXiv:2502.11267 </a> (cross-list from cs.HC) [<a href="/pdf/2502.11267" title="Download PDF" id="pdf-2502.11267" aria-labelledby="pdf-2502.11267">pdf</a>, <a href="https://arxiv.org/html/2502.11267v1" title="View HTML" id="html-2502.11267" aria-labelledby="html-2502.11267" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11267" title="Other formats" id="oth-2502.11267" aria-labelledby="oth-2502.11267">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Prompting in the Dark: Assessing Human Performance in Prompt Engineering for Data Labeling When Gold Labels Are Absent </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zeyu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naphade,+S">Saniya Naphade</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+T+'">Ting-Hao 'Kenneth' Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted By CHI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Millions of users prompt large language models (LLMs) for various tasks, but how good are people at prompt engineering? Do users actually get closer to their desired outcome over multiple iterations of their prompts? 
These questions are crucial when no gold-standard labels are available to measure progress. This paper investigates a scenario in LLM-powered data labeling, "prompting in the dark," where users iteratively prompt LLMs to label data without using manually-labeled benchmarks. We developed PromptingSheet, a Google Sheets add-on that enables users to compose, revise, and iteratively label data through spreadsheets. Through a study with 20 participants, we found that prompting in the dark was highly unreliable-only 9 participants improved labeling accuracy after four or more iterations. Automated prompt optimization tools like DSPy also struggled when few gold labels were available. Our findings highlight the importance of gold labels and the needs, as well as the risks, of automated support in human prompt engineering, providing insights for future tool design. </p> </div> </dd> <dt> <a name='item244'>[244]</a> <a href ="/abs/2502.11271" title="Abstract" id="2502.11271"> arXiv:2502.11271 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11271" title="Download PDF" id="pdf-2502.11271" aria-labelledby="pdf-2502.11271">pdf</a>, <a href="https://arxiv.org/html/2502.11271v1" title="View HTML" id="html-2502.11271" aria-labelledby="html-2502.11271" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11271" title="Other formats" id="oth-2502.11271" aria-labelledby="oth-2502.11271">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OctoTools: An Agentic Framework with Extensible Tools for Complex Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+P">Pan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+B">Bowen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thapa,+R">Rahul Thapa</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Boen,+J">Joseph Boen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+J">James Zou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 89 pages, 18 figures. Project website: <a href="https://octotools.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Solving complex reasoning tasks may involve visual understanding, domain knowledge retrieval, numerical calculation, and multi-step reasoning. Existing methods augment large language models (LLMs) with external tools but are restricted to specialized domains, limited tool types, or require additional training data. In this paper, we introduce OctoTools, a training-free, user-friendly, and easily extensible open-source agentic framework designed to tackle complex reasoning across diverse domains. OctoTools introduces standardized tool cards to encapsulate tool functionality, a planner for both high-level and low-level planning, and an executor to carry out tool usage. We validate OctoTools' generality across 16 diverse tasks (including MathVista, MMLU-Pro, MedQA, and GAIA-Text), achieving substantial average accuracy gains of 9.3% over GPT-4o. Furthermore, OctoTools outperforms AutoGen, GPT-Functions and LangChain by up to 10.6% when given the same set of tools. Through comprehensive analysis and ablations, OctoTools demonstrates advantages in task planning, effective tool usage, and multi-step problem solving. 
</p> </div> </dd> <dt> <a name='item245'>[245]</a> <a href ="/abs/2502.11298" title="Abstract" id="2502.11298"> arXiv:2502.11298 </a> (cross-list from cs.NI) [<a href="/pdf/2502.11298" title="Download PDF" id="pdf-2502.11298" aria-labelledby="pdf-2502.11298">pdf</a>, <a href="https://arxiv.org/html/2502.11298v1" title="View HTML" id="html-2502.11298" aria-labelledby="html-2502.11298" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11298" title="Other formats" id="oth-2502.11298" aria-labelledby="oth-2502.11298">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Integrating Language Models for Enhanced Network State Monitoring in DRL-Based SFC Provisioning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Moshiri,+P+F">Parisa Fard Moshiri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Onsu,+M+A">Murat Arda Onsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lohan,+P">Poonam Lohan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kantarci,+B">Burak Kantarci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Janulewicz,+E">Emil Janulewicz</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, 5 figures, submitted to 30th IEEE International Symposium on Computers and Communications (ISCC) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Networking and Internet Architecture (cs.NI)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Efficient Service Function Chain (SFC) provisioning and Virtual Network Function (VNF) placement are critical for enhancing network performance in modern architectures such as Software-Defined Networking (SDN) and Network Function Virtualization (NFV). 
While Deep Reinforcement Learning (DRL) aids decision-making in dynamic network environments, its reliance on structured inputs and predefined rules limits adaptability in unforeseen scenarios. Additionally, incorrect actions by a DRL agent may require numerous training iterations to correct, potentially reinforcing suboptimal policies and degrading performance. This paper integrates DRL with Language Models (LMs), specifically Bidirectional Encoder Representations from Transformers (BERT) and DistilBERT, to enhance network management. By feeding final VNF allocations from DRL into the LM, the system can process and respond to queries related to SFCs, DCs, and VNFs, enabling real-time insights into resource utilization, bottleneck detection, and future demand planning. The LMs are fine-tuned to our domain-specific dataset using Low-Rank Adaptation (LoRA). Results show that BERT outperforms DistilBERT with a lower test loss (0.28 compared to 0.36) and higher confidence (0.83 compared to 0.74), though BERT requires approximately 46% more processing time. 
</p> </div> </dd> <dt> <a name='item246'>[246]</a> <a href ="/abs/2502.11304" title="Abstract" id="2502.11304"> arXiv:2502.11304 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11304" title="Download PDF" id="pdf-2502.11304" aria-labelledby="pdf-2502.11304">pdf</a>, <a href="https://arxiv.org/html/2502.11304v1" title="View HTML" id="html-2502.11304" aria-labelledby="html-2502.11304" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11304" title="Other formats" id="oth-2502.11304" aria-labelledby="oth-2502.11304">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Multimodal-LLMs Assisted by Instance Segmentation for Intelligent Traffic Monitoring </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Onsu,+M+A">Murat Arda Onsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lohan,+P">Poonam Lohan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kantarci,+B">Burak Kantarci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Syed,+A">Aisha Syed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Andrews,+M">Matthew Andrews</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kennedy,+S">Sean Kennedy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, 7 figures, submitted to 30th IEEE International Symposium on Computers and Communications (ISCC) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> A robust and efficient traffic monitoring system is essential for smart cities and Intelligent Transportation Systems (ITS), using sensors and cameras to track vehicle movements, optimize traffic flow, reduce congestion, 
enhance road safety, and enable real-time adaptive traffic control. Traffic monitoring models must comprehensively understand dynamic urban conditions and provide an intuitive user interface for effective management. This research leverages the LLaVA visual grounding multimodal large language model (LLM) for traffic monitoring tasks on the real-time Quanser Interactive Lab simulation platform, covering scenarios like intersections, congestion, and collisions. Cameras placed at multiple urban locations collect real-time images from the simulation, which are fed into the LLaVA model with queries for analysis. An instance segmentation model integrated into the cameras highlights key elements such as vehicles and pedestrians, enhancing training and throughput. The system achieves 84.3% accuracy in recognizing vehicle locations and 76.4% in determining steering direction, outperforming traditional models. </p> </div> </dd> <dt> <a name='item247'>[247]</a> <a href ="/abs/2502.11308" title="Abstract" id="2502.11308"> arXiv:2502.11308 </a> (cross-list from cs.CR) [<a href="/pdf/2502.11308" title="Download PDF" id="pdf-2502.11308" aria-labelledby="pdf-2502.11308">pdf</a>, <a href="https://arxiv.org/html/2502.11308v1" title="View HTML" id="html-2502.11308" aria-labelledby="html-2502.11308" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11308" title="Other formats" id="oth-2502.11308" aria-labelledby="oth-2502.11308">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ALGEN: Few-shot Inversion Attacks on Textual Embeddings using Alignment and Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yiyi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Q">Qiongkai Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bjerva,+J">Johannes Bjerva</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> 18 pages, 13 tables, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> With the growing popularity of Large Language Models (LLMs) and vector databases, private textual data is increasingly processed and stored as numerical embeddings. However, recent studies have proven that such embeddings are vulnerable to inversion attacks, where original text is reconstructed to reveal sensitive information. Previous research has largely assumed access to millions of sentences to train attack models, e.g., through data leakage or nearly unrestricted API access. With our method, a single data point is sufficient for a partially successful inversion attack. With as little as 1k data samples, performance reaches an optimum across a range of black-box encoders, without training on leaked data. We present a Few-shot Textual Embedding Inversion Attack using ALignment and GENeration (ALGEN), by aligning victim embeddings to the attack space and using a generative model to reconstruct text. We find that ALGEN attacks can be effectively transferred across domains and languages, revealing key information. We further examine a variety of defense mechanisms against ALGEN, and find that none are effective, highlighting the vulnerabilities posed by inversion attacks. By significantly lowering the cost of inversion and proving that embedding spaces can be aligned through one-step optimization, we establish a new textual embedding inversion paradigm with broader applications for embedding alignment in NLP. 
</p> </div> </dd> <dt> <a name='item248'>[248]</a> <a href ="/abs/2502.11356" title="Abstract" id="2502.11356"> arXiv:2502.11356 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11356" title="Download PDF" id="pdf-2502.11356" aria-labelledby="pdf-2502.11356">pdf</a>, <a href="https://arxiv.org/html/2502.11356v1" title="View HTML" id="html-2502.11356" aria-labelledby="html-2502.11356" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11356" title="Other formats" id="oth-2502.11356" aria-labelledby="oth-2502.11356">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SAIF: A Sparse Autoencoder Framework for Interpreting and Steering Instruction Following of Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zirui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haiyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+Y">Yiran Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Payani,+A">Ali Payani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+M">Mengnan Du</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages, 11 figures, 6 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> The ability of large language models (LLMs) to follow instructions is crucial for their practical applications, yet the underlying mechanisms remain poorly understood. 
This paper presents a novel framework that leverages sparse autoencoders (SAE) to interpret how instruction following works in these models. We demonstrate how the features we identify can effectively steer model outputs to align with given instructions. Through analysis of SAE latent activations, we identify specific latents responsible for instruction following behavior. Our findings reveal that instruction following capabilities are encoded by a distinct set of instruction-relevant SAE latents. These latents both show semantic proximity to relevant instructions and demonstrate causal effects on model behavior. Our research highlights several crucial factors for achieving effective steering performance: precise feature identification, the role of final layer, and optimal instruction positioning. Additionally, we demonstrate that our methodology scales effectively across SAEs and LLMs of varying sizes. </p> </div> </dd> <dt> <a name='item249'>[249]</a> <a href ="/abs/2502.11360" title="Abstract" id="2502.11360"> arXiv:2502.11360 </a> (cross-list from cs.CV) [<a href="/pdf/2502.11360" title="Download PDF" id="pdf-2502.11360" aria-labelledby="pdf-2502.11360">pdf</a>, <a href="https://arxiv.org/html/2502.11360v1" title="View HTML" id="html-2502.11360" aria-labelledby="html-2502.11360" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11360" title="Other formats" id="oth-2502.11360" aria-labelledby="oth-2502.11360">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GeoDANO: Geometric VLM with Domain Agnostic Vision Encoder </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+S">Seunghyuk Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Z">Zhenyue Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+Y">Youngbin 
Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seungbeom Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Dongwoo Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 7 figures, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We introduce GeoDANO, a geometric vision-language model (VLM) with a domain-agnostic vision encoder, for solving plane geometry problems. Although VLMs have been employed for solving geometry problems, their ability to recognize geometric features remains insufficiently analyzed. To address this gap, we propose a benchmark that evaluates the recognition of visual geometric features, including primitives such as dots and lines, and relations such as orthogonality. Our preliminary study shows that vision encoders often used in general-purpose VLMs, e.g., OpenCLIP, fail to detect these features and struggle to generalize across domains. We develop GeoCLIP, a CLIP based model trained on synthetic geometric diagram-caption pairs to overcome the limitation. Benchmark results show that GeoCLIP outperforms existing vision encoders in recognizing geometric features. We then propose our VLM, GeoDANO, which augments GeoCLIP with a domain adaptation strategy for unseen diagram styles. GeoDANO outperforms specialized methods for plane geometry problems and GPT-4o on MathVerse. 
</p> </div> </dd> <dt> <a name='item250'>[250]</a> <a href ="/abs/2502.11367" title="Abstract" id="2502.11367"> arXiv:2502.11367 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11367" title="Download PDF" id="pdf-2502.11367" aria-labelledby="pdf-2502.11367">pdf</a>, <a href="https://arxiv.org/html/2502.11367v1" title="View HTML" id="html-2502.11367" aria-labelledby="html-2502.11367" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11367" title="Other formats" id="oth-2502.11367" aria-labelledby="oth-2502.11367">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sparse Autoencoder Features for Classifications and Transferability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gallifant,+J">Jack Gallifant</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sasse,+K">Kuleen Sasse</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aerts,+H">Hugo Aerts</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hartvigsen,+T">Thomas Hartvigsen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bitterman,+D+S">Danielle S. Bitterman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Sparse Autoencoders (SAEs) provide potentials for uncovering structured, human-interpretable representations in Large Language Models (LLMs), making them a crucial tool for transparent and controllable AI systems. We systematically analyze SAE for interpretable feature extraction from LLMs in safety-critical classification tasks. 
Our framework evaluates (1) model-layer selection and scaling properties, (2) SAE architectural configurations, including width and pooling strategies, and (3) the effect of binarizing continuous SAE activations. SAE-derived features achieve macro F1 > 0.8, outperforming hidden-state and BoW baselines while demonstrating cross-model transfer from Gemma 2 2B to 9B-IT models. These features generalize in a zero-shot manner to cross-lingual toxicity detection and visual classification tasks. Our analysis highlights the significant impact of pooling strategies and binarization thresholds, showing that binarization offers an efficient alternative to traditional feature selection while maintaining or improving performance. These findings establish new best practices for SAE-based interpretability and enable scalable, transparent deployment of LLMs in real-world applications. Full repo: <a href="https://github.com/shan23chen/MOSAIC" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item251'>[251]</a> <a href ="/abs/2502.11379" title="Abstract" id="2502.11379"> arXiv:2502.11379 </a> (cross-list from cs.CR) [<a href="/pdf/2502.11379" title="Download PDF" id="pdf-2502.11379" aria-labelledby="pdf-2502.11379">pdf</a>, <a href="https://arxiv.org/html/2502.11379v1" title="View HTML" id="html-2502.11379" aria-labelledby="html-2502.11379" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11379" title="Other formats" id="oth-2502.11379" aria-labelledby="oth-2502.11379">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CCJA: Context-Coherent Jailbreak Attack for Aligned Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+G">Guanghao Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+P">Panjia Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+M">Mingyuan Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Cen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+M">Mingyuan Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jun Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Despite explicit alignment efforts for large language models (LLMs), they can still be exploited to trigger unintended behaviors, a phenomenon known as "jailbreaking." Current jailbreak attack methods mainly focus on discrete prompt manipulations targeting closed-source LLMs, relying on manually crafted prompt templates and persuasion rules. 
However, as the capabilities of open-source LLMs improve, ensuring their safety becomes increasingly crucial. In such an environment, the accessibility of model parameters and gradient information by potential attackers exacerbates the severity of jailbreak threats. To address this research gap, we propose a novel \underline{C}ontext-\underline{C}oherent \underline{J}ailbreak \underline{A}ttack (CCJA). We define jailbreak attacks as an optimization problem within the embedding space of masked language models. Through combinatorial optimization, we effectively balance the jailbreak attack success rate with semantic coherence. Extensive evaluations show that our method not only maintains semantic consistency but also surpasses state-of-the-art baselines in attack effectiveness. Additionally, by integrating semantically coherent jailbreak prompts generated by our method into widely used black-box methodologies, we observe a notable enhancement in their success rates when targeting closed-source commercial LLMs. This highlights the security threat posed by open-source LLMs to commercial counterparts. We will open-source our code if the paper is accepted. 
</p> </div> </dd> <dt> <a name='item252'>[252]</a> <a href ="/abs/2502.11435" title="Abstract" id="2502.11435"> arXiv:2502.11435 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11435" title="Download PDF" id="pdf-2502.11435" aria-labelledby="pdf-2502.11435">pdf</a>, <a href="https://arxiv.org/html/2502.11435v1" title="View HTML" id="html-2502.11435" aria-labelledby="html-2502.11435" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11435" title="Other formats" id="oth-2502.11435" aria-labelledby="oth-2502.11435">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SMART: Self-Aware Agent for Tool Overuse Mitigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+C">Cheng Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Acikgoz,+E+C">Emre Can Acikgoz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hongru Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sil,+A">Avirup Sil</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-Tür</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tur,+G">Gokhan Tur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+H">Heng Ji</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 8 tables, 7 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Current Large Language Model (LLM) agents demonstrate strong reasoning and tool use capabilities, but often lack self-awareness, failing to balance these approaches effectively. 
This imbalance leads to Tool Overuse, where models unnecessarily rely on external tools for tasks solvable with parametric knowledge, increasing computational overhead. Inspired by human metacognition, we introduce SMART (Strategic Model-Aware Reasoning with Tools), a paradigm that enhances an agent's self-awareness to optimize task handling and reduce tool overuse. To support this paradigm, we introduce SMART-ER, a dataset spanning three domains, where reasoning alternates between parametric knowledge and tool-dependent steps, with each step enriched by rationales explaining when tools are necessary. Through supervised training, we develop SMARTAgent, a family of models that dynamically balance parametric knowledge and tool use. Evaluations show that SMARTAgent reduces tool use by 24% while improving performance by over 37%, enabling 7B-scale models to match its 70B counterpart and GPT-4o. Additionally, SMARTAgent generalizes to out-of-distribution test data like GSM8K and MINTQA, maintaining accuracy with just one-fifth the tool calls. These highlight the potential of strategic tool use to enhance reasoning, mitigate overuse, and bridge the gap between model size and performance, advancing intelligent and resource-efficient agent designs. 
</p> </div> </dd> <dt> <a name='item253'>[253]</a> <a href ="/abs/2502.11442" title="Abstract" id="2502.11442"> arXiv:2502.11442 </a> (cross-list from cs.IR) [<a href="/pdf/2502.11442" title="Download PDF" id="pdf-2502.11442" aria-labelledby="pdf-2502.11442">pdf</a>, <a href="https://arxiv.org/html/2502.11442v1" title="View HTML" id="html-2502.11442" aria-labelledby="html-2502.11442" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11442" title="Other formats" id="oth-2502.11442" aria-labelledby="oth-2502.11442">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Turn Multi-Modal Question Clarification for Enhanced Conversational Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ramezan,+K">Kimia Ramezan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bavandpour,+A+A">Alireza Amiri Bavandpour</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yifei Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Siro,+C">Clemencia Siro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aliannejadi,+M">Mohammad Aliannejadi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Conversational query clarification enables users to refine their search queries through interactive dialogue, improving search effectiveness. Traditional approaches rely on text-based clarifying questions, which often fail to capture complex user preferences, particularly those involving visual attributes. 
While recent work has explored single-turn multi-modal clarification with images alongside text, such methods do not fully support the progressive nature of user intent refinement over multiple turns. Motivated by this, we introduce the Multi-turn Multi-modal Clarifying Questions (MMCQ) task, which combines text and visual modalities to refine user queries in a multi-turn conversation. To facilitate this task, we create a large-scale dataset named ClariMM comprising over 13k multi-turn interactions and 33k question-answer pairs containing multi-modal clarifying questions. We propose Mario, a retrieval framework that employs a two-phase ranking strategy: initial retrieval with BM25, followed by a multi-modal generative re-ranking model that integrates textual and visual information from conversational history. Our experiments show that multi-turn multi-modal clarification outperforms uni-modal and single-turn approaches, improving MRR by 12.88%. The gains are most significant in longer interactions, demonstrating the value of progressive refinement for complex queries. 
</p> </div> </dd> <dt> <a name='item254'>[254]</a> <a href ="/abs/2502.11466" title="Abstract" id="2502.11466"> arXiv:2502.11466 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11466" title="Download PDF" id="pdf-2502.11466" aria-labelledby="pdf-2502.11466">pdf</a>, <a href="https://arxiv.org/html/2502.11466v1" title="View HTML" id="html-2502.11466" aria-labelledby="html-2502.11466" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11466" title="Other formats" id="oth-2502.11466" aria-labelledby="oth-2502.11466">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GiFT: Gibbs Fine-Tuning for Code Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haochen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+W">Wanjin Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xin Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Z">Zhiqi Shen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Software Engineering (cs.SE) </div> <p class='mathjax'> Training Large Language Models (LLMs) with synthetic data is a prevalent practice in code generation. A key approach is self-training, where LLMs are iteratively trained on self-generated correct code snippets. In this case, the self-generated codes are drawn from a conditional distribution, conditioned on a specific seed description. However, the seed description is not the only valid representation that aligns with its intended meaning. With all valid descriptions and codes forming a joint space, codes drawn from the conditional distribution would lead to an underrepresentation of the full description-code space. 
As such, we propose Gibbs Fine-Tuning (GiFT), a novel self-training method inspired by Gibbs sampling. GiFT allows self-generated data to be drawn from the marginal distribution of the joint space, thereby mitigating the biases inherent in conditional sampling. We provide a theoretical analysis demonstrating the potential benefits of fine-tuning LLMs with code derived from the marginal distribution. Furthermore, we propose a perplexity-based code selection method to mitigate the imbalanced long-tail distribution of the self-generated codes. Empirical evaluation of two LLMs across four datasets demonstrates that GiFT achieves superior performance, particularly on more challenging benchmarks. </p> </div> </dd> <dt> <a name='item255'>[255]</a> <a href ="/abs/2502.11482" title="Abstract" id="2502.11482"> arXiv:2502.11482 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11482" title="Download PDF" id="pdf-2502.11482" aria-labelledby="pdf-2502.11482">pdf</a>, <a href="https://arxiv.org/html/2502.11482v1" title="View HTML" id="html-2502.11482" aria-labelledby="html-2502.11482" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11482" title="Other formats" id="oth-2502.11482" aria-labelledby="oth-2502.11482">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DATA: Decomposed Attention-based Task Adaptation for Rehearsal-Free Continual Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+H">Huanxuan Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shizhu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Y">Yupu Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jun Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kang Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Continual learning (CL) is essential for Large Language Models (LLMs) to adapt to evolving real-world demands, yet they are susceptible to catastrophic forgetting (CF). While traditional CF solutions rely on expensive data rehearsal, recent rehearsal-free methods employ model-based and regularization-based strategies to address this issue. However, these approaches often neglect the model's plasticity, which is crucial to achieving optimal performance on newly learned tasks. Consequently, a key challenge in CL is striking a balance between preserving plasticity and mitigating CF. To tackle this challenge, we propose the $\textbf{D}$ecomposed $\textbf{A}$ttention-based $\textbf{T}$ask $\textbf{A}$daptation (DATA), which explicitly decouples and learns both task-specific and task-shared knowledge using high-rank and low-rank task adapters (e.g., LoRAs). For new tasks, DATA dynamically adjusts the weights of adapters of different ranks based on their relevance and distinction from previous tasks, allowing the model to acquire new task-specific skills while effectively retaining previously learned knowledge. Specifically, we implement a decomposed component weighting strategy comprising learnable components that collectively generate attention-based weights, allowing the model to integrate and utilize diverse knowledge from each DATA. Extensive experiments on three widely used benchmarks demonstrate that our proposed method achieves state-of-the-art performance. Notably, our approach significantly enhances model plasticity and mitigates CF by extending learnable components and employing stochastic restoration during training iterations. 
</p> </div> </dd> <dt> <a name='item256'>[256]</a> <a href ="/abs/2502.11492" title="Abstract" id="2502.11492"> arXiv:2502.11492 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11492" title="Download PDF" id="pdf-2502.11492" aria-labelledby="pdf-2502.11492">pdf</a>, <a href="https://arxiv.org/html/2502.11492v1" title="View HTML" id="html-2502.11492" aria-labelledby="html-2502.11492" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11492" title="Other formats" id="oth-2502.11492" aria-labelledby="oth-2502.11492">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+K">Kung-Hsiang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+C">Can Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+H">Haoyi Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laban,+P">Philippe Laban</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Joty,+S">Shafiq Joty</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+C">Caiming Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chien-Sheng Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Vision Language Models (VLMs) have achieved remarkable progress in multimodal tasks, yet they often struggle with visual arithmetic, seemingly simple capabilities like object counting or length comparison, which are essential for relevant complex tasks like chart understanding and geometric reasoning. 
In this work, we first investigate the root causes of this deficiency through a suite of probing tasks focusing on basic visual arithmetic. Our analysis reveals that while pre-trained vision encoders typically capture sufficient information, the text decoder often fails to decode it correctly for arithmetic reasoning. To address this, we propose CogAlign, a novel post-training strategy inspired by Piaget's theory of cognitive development. CogAlign trains VLMs to recognize invariant properties under visual transformations. We demonstrate that this approach significantly improves the performance of three diverse VLMs on our proposed probing tasks. Furthermore, CogAlign enhances performance by an average of 4.6% on CHOCOLATE and 2.9% on MATH-VISION, outperforming or matching supervised fine-tuning methods while requiring only 60% less training data. These results highlight the effectiveness and generalizability of CogAlign in improving fundamental visual arithmetic capabilities and their transfer to downstream tasks. 
</p> </div> </dd> <dt> <a name='item257'>[257]</a> <a href ="/abs/2502.11554" title="Abstract" id="2502.11554"> arXiv:2502.11554 </a> (cross-list from cs.HC) [<a href="/pdf/2502.11554" title="Download PDF" id="pdf-2502.11554" aria-labelledby="pdf-2502.11554">pdf</a>, <a href="https://arxiv.org/html/2502.11554v1" title="View HTML" id="html-2502.11554" aria-labelledby="html-2502.11554" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11554" title="Other formats" id="oth-2502.11554" aria-labelledby="oth-2502.11554">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Toward Metaphor-Fluid Conversation Design for Voice User Interfaces </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Desai,+S">Smit Desai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chin,+J">Jessie Chin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dakuo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cowan,+B">Benjamin Cowan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Twidale,+M">Michael Twidale</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computers and Society (cs.CY); Emerging Technologies (cs.ET) </div> <p class='mathjax'> Metaphors play a critical role in shaping user experiences with Voice User Interfaces (VUIs), yet existing designs often rely on static, human-centric metaphors that fail to adapt to diverse contexts and user needs. This paper introduces Metaphor-Fluid Design, a novel approach that dynamically adjusts metaphorical representations based on conversational use-contexts. 
We compare this approach to a Default VUI, which characterizes the present implementation of commercial VUIs commonly designed around the persona of an assistant, offering a uniform interaction style across contexts. In Study 1 (N=130), metaphors were mapped to four key use-contexts—commands, information seeking, sociality, and error recovery—along the dimensions of formality and hierarchy, revealing distinct preferences for task-specific metaphorical designs. Study 2 (N=91) evaluates a Metaphor-Fluid VUI against a Default VUI, showing that the Metaphor-Fluid VUI enhances perceived intention to adopt, enjoyment, and likability by aligning better with user expectations for different contexts. However, individual differences in metaphor preferences highlight the need for personalization. These findings challenge the one-size-fits-all paradigm of VUI design and demonstrate the potential of Metaphor-Fluid Design to create more adaptive and engaging human-AI interactions. </p> </div> </dd> <dt> <a name='item258'>[258]</a> <a href ="/abs/2502.11645" title="Abstract" id="2502.11645"> arXiv:2502.11645 </a> (cross-list from cs.GT) [<a href="/pdf/2502.11645" title="Download PDF" id="pdf-2502.11645" aria-labelledby="pdf-2502.11645">pdf</a>, <a href="/format/2502.11645" title="Other formats" id="oth-2502.11645" aria-labelledby="oth-2502.11645">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Deviation Ratings: A General, Clone-Invariant Rating Method </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Marris,+L">Luke Marris</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gemp,+I">Ian Gemp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Piliouras,+G">Georgios Piliouras</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lanctot,+M">Marc 
Lanctot</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Science and Game Theory (cs.GT)</span>; Computation and Language (cs.CL); Multiagent Systems (cs.MA); Other Statistics (stat.OT) </div> <p class='mathjax'> Many real-world multi-agent or multi-task evaluation scenarios can be naturally modelled as normal-form games due to inherent strategic (adversarial, cooperative, and mixed motive) interactions. These strategic interactions may be agentic (e.g. players trying to win), fundamental (e.g. cost vs quality), or complementary (e.g. niche finding and specialization). In such a formulation, it is the strategies (actions, policies, agents, models, tasks, prompts, etc.) that are rated. However, the rating problem is complicated by redundancy and complexity of N-player strategic interactions. Repeated or similar strategies can distort ratings for those that counter or complement them. Previous work proposed ``clone invariant'' ratings to handle such redundancies, but this was limited to two-player zero-sum (i.e. strictly competitive) interactions. This work introduces the first N-player general-sum clone invariant rating, called deviation ratings, based on coarse correlated equilibria. The rating is explored on several domains including LLMs evaluation. 
</p> </div> </dd> <dt> <a name='item259'>[259]</a> <a href ="/abs/2502.11678" title="Abstract" id="2502.11678"> arXiv:2502.11678 </a> (cross-list from cs.CY) [<a href="/pdf/2502.11678" title="Download PDF" id="pdf-2502.11678" aria-labelledby="pdf-2502.11678">pdf</a>, <a href="https://arxiv.org/html/2502.11678v1" title="View HTML" id="html-2502.11678" aria-labelledby="html-2502.11678" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11678" title="Other formats" id="oth-2502.11678" aria-labelledby="oth-2502.11678">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring LLM-based Student Simulation for Metacognitive Cultivation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haoxuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jifan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cong,+X">Xin Cong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dang,+Y">Yang Dang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhan,+Y">Yisi Zhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Huiqin Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiyuan Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Metacognitive education plays a crucial role in cultivating students' self-regulation and reflective thinking, providing essential support for those with learning difficulties through academic advising. 
Simulating students with insufficient learning capabilities using large language models offers a promising approach to refining pedagogical methods without ethical concerns. However, existing simulations often fail to authentically represent students' learning struggles and face challenges in evaluation due to the lack of reliable metrics and ethical constraints in data collection. To address these issues, we propose a pipeline for automatically generating and filtering high-quality simulated student agents. Our approach leverages a two-round automated scoring system validated by human experts and employs a score propagation module to obtain more consistent scores across the student graph. Experimental results demonstrate that our pipeline efficiently identifies high-quality student agents, and we discuss the traits that influence the simulation's effectiveness. By simulating students with varying degrees of learning difficulties, our work paves the way for broader applications in personalized learning and educational assessment. 
</p> </div> </dd> <dt> <a name='item260'>[260]</a> <a href ="/abs/2502.11767" title="Abstract" id="2502.11767"> arXiv:2502.11767 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11767" title="Download PDF" id="pdf-2502.11767" aria-labelledby="pdf-2502.11767">pdf</a>, <a href="https://arxiv.org/html/2502.11767v1" title="View HTML" id="html-2502.11767" aria-labelledby="html-2502.11767" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11767" title="Other formats" id="oth-2502.11767" aria-labelledby="oth-2502.11767">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Selection to Generation: A Survey of LLM-based Active Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+Y">Yu Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+S">Subhojyoti Mukherjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhouhang Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Junda Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xintong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aponte,+R">Ryan Aponte</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+H">Hanjia Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barrow,+J">Joe Barrow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hongjie Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dernoncourt,+F">Franck Dernoncourt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kveton,+B">Branislav Kveton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+R">Ruiyi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jiuxiang Gu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ahmed,+N+K">Nesreen K. Ahmed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deilamsalehy,+H">Hanieh Deilamsalehy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Sungchul Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Z">Zhengmian Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yue Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lipka,+N">Nedim Lipka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yoon,+S">Seunghyun Yoon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+T+K">Ting-Hao Kenneth Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zichao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mathur,+P">Puneet Mathur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pal,+S">Soumyabrata Pal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+K">Koyel Mukherjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhehao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+N">Namyong Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+T+H">Thien Huu Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jiebo Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rossi,+R+A">Ryan A. 
Rossi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McAuley,+J">Julian McAuley</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Active Learning (AL) has been a powerful paradigm for improving model efficiency and performance by selecting the most informative data points for labeling and training. In recent active learning frameworks, Large Language Models (LLMs) have been employed not only for selection but also for generating entirely new data instances and providing more cost-effective annotations. Motivated by the increasing importance of high-quality data and efficient model training in the era of LLMs, we present a comprehensive survey on LLM-based Active Learning. We introduce an intuitive taxonomy that categorizes these techniques and discuss the transformative roles LLMs can play in the active learning loop. We further examine the impact of AL on LLM learning paradigms and its applications across various domains. Finally, we identify open challenges and propose future research directions. This survey aims to serve as an up-to-date resource for researchers and practitioners seeking to gain an intuitive understanding of LLM-based AL techniques and deploy them to new applications. 
</p> </div> </dd> <dt> <a name='item261'>[261]</a> <a href ="/abs/2502.11799" title="Abstract" id="2502.11799"> arXiv:2502.11799 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11799" title="Download PDF" id="pdf-2502.11799" aria-labelledby="pdf-2502.11799">pdf</a>, <a href="https://arxiv.org/html/2502.11799v1" title="View HTML" id="html-2502.11799" aria-labelledby="html-2502.11799" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11799" title="Other formats" id="oth-2502.11799" aria-labelledby="oth-2502.11799">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Table-Critic: A Multi-Agent Framework for Collaborative Criticism and Refinement in Table Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+P">Peiying Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Guoxin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingjing Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Despite the remarkable capabilities of large language models (LLMs) in various reasoning tasks, they still struggle with table reasoning tasks, particularly in maintaining consistency throughout multi-step reasoning processes. While existing approaches have explored various decomposition strategies, they often lack effective mechanisms to identify and correct errors in intermediate reasoning steps, leading to cascading error propagation. To address these issues, we propose Table-Critic, a novel multi-agent framework that facilitates collaborative criticism and iterative refinement of the reasoning process until convergence to correct solutions. 
Our framework consists of four specialized agents: a Judge for error identification, a Critic for comprehensive critiques, a Refiner for process improvement, and a Curator for pattern distillation. To effectively deal with diverse and unpredictable error types, we introduce a self-evolving template tree that systematically accumulates critique knowledge through experience-driven learning and guides future reflections. Extensive experiments have demonstrated that Table-Critic achieves substantial improvements over existing methods, achieving superior accuracy and error correction rates while maintaining computational efficiency and lower solution degradation rate. </p> </div> </dd> <dt> <a name='item262'>[262]</a> <a href ="/abs/2502.11859" title="Abstract" id="2502.11859"> arXiv:2502.11859 </a> (cross-list from cs.CV) [<a href="/pdf/2502.11859" title="Download PDF" id="pdf-2502.11859" aria-labelledby="pdf-2502.11859">pdf</a>, <a href="https://arxiv.org/html/2502.11859v1" title="View HTML" id="html-2502.11859" aria-labelledby="html-2502.11859" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11859" title="Other formats" id="oth-2502.11859" aria-labelledby="oth-2502.11859">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Defining and Evaluating Visual Language Models' Basic Spatial Abilities: A Perspective from Psychometrics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Wenrui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+D">Dalin Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weihang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+J">Jie Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Chen Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yong Li</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The Theory of Multiple Intelligences underscores the hierarchical nature of cognitive capabilities. To advance Spatial Artificial Intelligence, we pioneer a psychometric framework defining five Basic Spatial Abilities (BSAs) in Visual Language Models (VLMs): Spatial Perception, Spatial Relation, Spatial Orientation, Mental Rotation, and Spatial Visualization. Benchmarking 13 mainstream VLMs through nine validated psychometric experiments reveals significant gaps versus humans (average score 24.95 vs. 68.38), with three key findings: 1) VLMs mirror human hierarchies (strongest in 2D orientation, weakest in 3D rotation) with independent BSAs (Pearson's r<0.4); 2) Smaller models such as Qwen2-VL-7B surpass larger counterparts, with Qwen leading (30.82) and InternVL2 lagging (19.6); 3) Interventions like chain-of-thought (0.100 accuracy gain) and 5-shot training (0.259 improvement) show limits from architectural constraints. Identified barriers include weak geometry encoding and missing dynamic simulation. By linking psychometric BSAs to VLM capabilities, we provide a diagnostic toolkit for spatial intelligence evaluation, methodological foundations for embodied AI development, and a cognitive science-informed roadmap for achieving human-like spatial intelligence. 
</p> </div> </dd> <dt> <a name='item263'>[263]</a> <a href ="/abs/2502.11880" title="Abstract" id="2502.11880"> arXiv:2502.11880 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11880" title="Download PDF" id="pdf-2502.11880" aria-labelledby="pdf-2502.11880">pdf</a>, <a href="https://arxiv.org/html/2502.11880v1" title="View HTML" id="html-2502.11880" aria-labelledby="html-2502.11880" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11880" title="Other formats" id="oth-2502.11880" aria-labelledby="oth-2502.11880">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bitnet.cpp: Efficient Edge Inference for Ternary LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jinheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Hansong Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+T">Ting Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+S">Shijie Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+Y">Yan Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Ting Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+J">Jianyu Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+S">Shuming Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hongyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+F">Furu Wei</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 11 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Distributed, Parallel, and Cluster Computing (cs.DC) </div> <p class='mathjax'> The advent of 
1-bit large language models (LLMs), led by BitNet b1.58, has spurred interest in ternary LLMs. Despite this, research and practical applications focusing on efficient edge inference for ternary LLMs remain scarce. To bridge this gap, we introduce <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a>, an inference system optimized for BitNet b1.58 and ternary LLMs. Given that mixed-precision matrix multiplication (mpGEMM) constitutes the bulk of inference time in ternary LLMs, <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a> incorporates a novel mpGEMM library to facilitate sub-2-bits-per-weight, efficient and lossless inference. The library features two core solutions: Ternary Lookup Table (TL), which addresses spatial inefficiencies of previous bit-wise methods, and Int2 with a Scale (I2_S), which ensures lossless edge inference, both enabling high-speed inference. Our experiments show that <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a> achieves up to a 6.25x increase in speed over full-precision baselines and up to 2.32x over low-bit baselines, setting new benchmarks in the field. Additionally, we expand TL to element-wise lookup table (ELUT) for low-bit LLMs in the appendix, presenting both theoretical and empirical evidence of its considerable potential. <a href="http://Bitnet.cpp" rel="external noopener nofollow" class="link-external link-http">this http URL</a> is publicly available at <a href="https://github.com/microsoft/BitNet/tree/paper" rel="external noopener nofollow" class="link-external link-https">this https URL</a> , offering a sophisticated solution for the efficient and practical deployment of edge LLMs. 
</p> </div> </dd> <dt> <a name='item264'>[264]</a> <a href ="/abs/2502.11881" title="Abstract" id="2502.11881"> arXiv:2502.11881 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11881" title="Download PDF" id="pdf-2502.11881" aria-labelledby="pdf-2502.11881">pdf</a>, <a href="https://arxiv.org/html/2502.11881v1" title="View HTML" id="html-2502.11881" aria-labelledby="html-2502.11881" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11881" title="Other formats" id="oth-2502.11881" aria-labelledby="oth-2502.11881">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hypothesis-Driven Theory-of-Mind Reasoning for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyunwoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sclar,+M">Melanie Sclar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhi-Xuan,+T">Tan Zhi-Xuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ying,+L">Lance Ying</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levine,+S">Sydney Levine</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tenenbaum,+J+B">Joshua B. Tenenbaum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+Y">Yejin Choi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Existing LLM reasoning methods have shown impressive capabilities across various tasks, such as solving math and coding problems. However, applying these methods to scenarios without ground-truth answers or rule-based verification methods - such as tracking the mental states of an agent - remains challenging. 
Inspired by the sequential Monte Carlo algorithm, we introduce thought-tracing, an inference-time reasoning algorithm designed to trace the mental states of specific agents by generating hypotheses and weighting them based on observations without relying on ground-truth solutions to questions in datasets. Our algorithm is modeled after the Bayesian theory-of-mind framework, using LLMs to approximate probabilistic inference over agents' evolving mental states based on their perceptions and actions. We evaluate thought-tracing on diverse theory-of-mind benchmarks, demonstrating significant performance improvements compared to baseline LLMs. Our experiments also reveal interesting behaviors of the recent reasoning models - e.g., o1 and R1 - on theory-of-mind, highlighting the difference of social reasoning compared to other domains. </p> </div> </dd> <dt> <a name='item265'>[265]</a> <a href ="/abs/2502.11882" title="Abstract" id="2502.11882"> arXiv:2502.11882 </a> (cross-list from cs.AI) [<a href="/pdf/2502.11882" title="Download PDF" id="pdf-2502.11882" aria-labelledby="pdf-2502.11882">pdf</a>, <a href="https://arxiv.org/html/2502.11882v1" title="View HTML" id="html-2502.11882" aria-labelledby="html-2502.11882" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11882" title="Other formats" id="oth-2502.11882" aria-labelledby="oth-2502.11882">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Dual Process Theory in Language Agent Framework for Real-time Simultaneous Human-AI Collaboration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xihuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenhao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chaoran Li</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Junru Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+T">Tingyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+L">Lin Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+X">Xuezhi Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+W">Wen Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weinan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinbing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Y">Ying Wen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Agents built on large language models (LLMs) have excelled in turn-by-turn human-AI collaboration but struggle with simultaneous tasks requiring real-time interaction. Latency issues and the challenge of inferring variable human strategies hinder their ability to make autonomous decisions without explicit instructions. Through experiments with current independent System 1 and System 2 methods, we validate the necessity of using Dual Process Theory (DPT) in real-time tasks. We propose DPT-Agent, a novel language agent framework that integrates System 1 and System 2 for efficient real-time simultaneous human-AI collaboration. DPT-Agent's System 1 uses a Finite-state Machine (FSM) and code-as-policy for fast, intuitive, and controllable decision-making. 
DPT-Agent's System 2 integrates Theory of Mind (ToM) and asynchronous reflection to infer human intentions and perform reasoning-based autonomous decisions. We demonstrate the effectiveness of DPT-Agent through further experiments with rule-based agents and human collaborators, showing significant improvements over mainstream LLM-based frameworks. To the best of our knowledge, DPT-Agent is the first language agent framework that achieves successful real-time simultaneous human-AI collaboration autonomously. Code of DPT-Agent can be found in <a href="https://github.com/sjtu-marl/DPT-Agent" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item266'>[266]</a> <a href ="/abs/2502.11886" title="Abstract" id="2502.11886"> arXiv:2502.11886 </a> (cross-list from cs.LG) [<a href="/pdf/2502.11886" title="Download PDF" id="pdf-2502.11886" aria-labelledby="pdf-2502.11886">pdf</a>, <a href="https://arxiv.org/html/2502.11886v1" title="View HTML" id="html-2502.11886" aria-labelledby="html-2502.11886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11886" title="Other formats" id="oth-2502.11886" aria-labelledby="oth-2502.11886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LIMR: Less is More for RL Scaling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xuefeng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+H">Haoyang Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+P">Pengfei Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> In this paper, we 
ask: what truly determines the effectiveness of RL training data for enhancing language models' reasoning capabilities? While recent advances like o1, Deepseek R1, and Kimi1.5 demonstrate RL's potential, the lack of transparency about training data requirements has hindered systematic progress. Starting directly from base models without distillation, we challenge the assumption that scaling up RL training data inherently improves performance. We demonstrate that a strategically selected subset of just 1,389 samples can outperform the full 8,523-sample dataset. We introduce Learning Impact Measurement (LIM), an automated method to evaluate and prioritize training samples based on their alignment with model learning trajectories, enabling efficient resource utilization and scalable implementation. Our method achieves comparable or even superior performance using only 1,389 samples versus the full 8,523 samples dataset. Notably, while recent data-efficient approaches (e.g., LIMO and s1) show promise with 32B-scale models, we find it significantly underperforms at 7B-scale through supervised fine-tuning (SFT). In contrast, our RL-based LIMR achieves 16.7% higher accuracy on AIME24 and outperforms LIMO and s1 by 13.0% and 22.2% on MATH500. These results fundamentally reshape our understanding of RL scaling in LLMs, demonstrating that precise sample selection, rather than data scale, may be the key to unlocking enhanced reasoning capabilities. For reproducible research and future innovation, we are open-sourcing LIMR, including implementation of LIM, training and evaluation code, curated datasets, and trained models at <a href="https://github.com/GAIR-NLP/LIMR" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item267'>[267]</a> <a href ="/abs/2502.11919" title="Abstract" id="2502.11919"> arXiv:2502.11919 </a> (cross-list from cs.HC) [<a href="/pdf/2502.11919" title="Download PDF" id="pdf-2502.11919" aria-labelledby="pdf-2502.11919">pdf</a>, <a href="https://arxiv.org/html/2502.11919v1" title="View HTML" id="html-2502.11919" aria-labelledby="html-2502.11919" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11919" title="Other formats" id="oth-2502.11919" aria-labelledby="oth-2502.11919">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Text to Trust: Empowering AI-assisted Decision Making with Adaptive LLM-powered Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuoyan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+H">Hangxiao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Z">Zhuoran Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Z">Ziang Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+M">Ming Yin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CHI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> AI-assisted decision making becomes increasingly prevalent, yet individuals often fail to utilize AI-based decision aids appropriately especially when the AI explanations are absent, potentially as they do not %understand reflect on AI's decision recommendations critically. 
Large language models (LLMs), with their exceptional conversational and analytical capabilities, present great opportunities to enhance AI-assisted decision making in the absence of AI explanations by providing natural-language-based analysis of AI's decision recommendation, e.g., how each feature of a decision making task might contribute to the AI recommendation. In this paper, via a randomized experiment, we first show that presenting LLM-powered analysis of each task feature, either sequentially or concurrently, does not significantly improve people's AI-assisted decision performance. To enable decision makers to better leverage LLM-powered analysis, we then propose an algorithmic framework to characterize the effects of LLM-powered analysis on human decisions and dynamically decide which analysis to present. Our evaluation with human subjects shows that this approach effectively improves decision makers' appropriate reliance on AI in AI-assisted decision making. </p> </div> </dd> <dt> <a name='item268'>[268]</a> <a href ="/abs/2502.12025" title="Abstract" id="2502.12025"> arXiv:2502.12025 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12025" title="Download PDF" id="pdf-2502.12025" aria-labelledby="pdf-2502.12025">pdf</a>, <a href="/format/2502.12025" title="Other formats" id="oth-2502.12025" aria-labelledby="oth-2502.12025">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SafeChain: Safety of Language Models with Long Chain-of-Thought Reasoning Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+F">Fengqing Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zhangchen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuetai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niu,+L">Luyao Niu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+Z">Zhen Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Bo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+B+Y">Bill Yuchen Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Poovendran,+R">Radha Poovendran</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Emerging large reasoning models (LRMs), such as DeepSeek-R1 models, leverage long chain-of-thought (CoT) reasoning to generate structured intermediate steps, enhancing their reasoning capabilities. However, long CoT does not inherently guarantee safe outputs, potentially leading to harmful consequences such as the introduction of security vulnerabilities in code or the spread of misinformation. Current research on large language model (LLM) safety usually focuses on short-answer responses, overlooking the long CoT style outputs of LRMs. To bridge this gap, we conduct a systematic study of LRM safety. First, we investigate safety evaluators calibrated against human annotations. Using our newly developed metrics, we thoroughly assess the safety of 12 state-of-the-art LRMs on StrongReject and WildJailbreak datasets. Our results show that LRMs are not safe compared to their reasoning advance. Further, we perform a fine-grained analysis of the reasoning trace and final answer. We find that three decoding strategies-ZeroThink, LessThink, and MoreThink-can improve model safety without additional training. However, these strategies either use constrained reasoning traces or incur high inference costs. To better strengthen LRM safety, we introduce SafeChain, the first-of-its-kind safety training dataset in CoT style. 
We fine-tune two LRMs with SafeChain, showing that it not only enhances model safety but also preserves performance across 6 reasoning benchmarks. </p> </div> </dd> <dt> <a name='item269'>[269]</a> <a href ="/abs/2502.12081" title="Abstract" id="2502.12081"> arXiv:2502.12081 </a> (cross-list from cs.CV) [<a href="/pdf/2502.12081" title="Download PDF" id="pdf-2502.12081" aria-labelledby="pdf-2502.12081">pdf</a>, <a href="https://arxiv.org/html/2502.12081v1" title="View HTML" id="html-2502.12081" aria-labelledby="html-2502.12081" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12081" title="Other formats" id="oth-2502.12081" aria-labelledby="oth-2502.12081">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unhackable Temporal Rewarding for Scalable Video MLLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+E">En Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+K">Kangheng Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y">Yana Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zining Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+H">Haoran Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jianjian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Z">Zheng Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiangyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tao,+W">Wenbing Tao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICLR2025. 
Project Page: <a href="https://ahnsun.github.io/UTR/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> In the pursuit of superior video-processing MLLMs, we have encountered a perplexing paradox: the "anti-scaling law", where more data and larger models lead to worse performance. This study unmasks the culprit: "temporal hacking", a phenomenon where models shortcut by fixating on select frames, missing the full video narrative. In this work, we systematically establish a comprehensive theory of temporal hacking, defining it from a reinforcement learning perspective, introducing the Temporal Perplexity (TPL) score to assess this misalignment, and proposing the Unhackable Temporal Rewarding (UTR) framework to mitigate the temporal hacking. Both theoretically and empirically, TPL proves to be a reliable indicator of temporal modeling quality, correlating strongly with frame activation patterns. Extensive experiments reveal that UTR not only counters temporal hacking but significantly elevates video comprehension capabilities. This work not only advances video-AI systems but also illuminates the critical importance of aligning proxy rewards with true objectives in MLLM development. 
</p> </div> </dd> <dt> <a name='item270'>[270]</a> <a href ="/abs/2502.12085" title="Abstract" id="2502.12085"> arXiv:2502.12085 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12085" title="Download PDF" id="pdf-2502.12085" aria-labelledby="pdf-2502.12085">pdf</a>, <a href="https://arxiv.org/html/2502.12085v1" title="View HTML" id="html-2502.12085" aria-labelledby="html-2502.12085" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12085" title="Other formats" id="oth-2502.12085" aria-labelledby="oth-2502.12085">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> APB: Accelerating Distributed Long-Context Inference by Passing Compressed Context Blocks across GPUs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yuxiang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingye Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xu Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+C">Chaojun Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+W">Weilin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ao,+S">Sun Ao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+H">Hao Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jie Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhiyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> While long-context inference is crucial for advancing large language 
model (LLM) applications, its prefill speed remains a significant bottleneck. Current approaches, including sequence parallelism strategies and compute reduction through approximate attention mechanisms, still fall short of delivering optimal inference efficiency. This hinders scaling the inputs to longer sequences and processing long-context queries in a timely manner. To address this, we introduce APB, an efficient long-context inference framework that leverages multi-host approximate attention to enhance prefill speed by reducing compute and enhancing parallelism simultaneously. APB introduces a communication mechanism for essential key-value pairs within a sequence parallelism framework, enabling a faster inference speed while maintaining task performance. We implement APB by incorporating a tailored FlashAttn kernel alongside optimized distribution strategies, supporting diverse models and parallelism configurations. APB achieves speedups of up to 9.2x, 4.2x, and 1.6x compared with FlashAttn, RingAttn, and StarAttn, respectively, without any observable task performance degradation. We provide the implementation and experiment code of APB in <a href="https://github.com/thunlp/APB" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item271'>[271]</a> <a href ="/abs/2502.12094" title="Abstract" id="2502.12094"> arXiv:2502.12094 </a> (cross-list from cs.AI) [<a href="/pdf/2502.12094" title="Download PDF" id="pdf-2502.12094" aria-labelledby="pdf-2502.12094">pdf</a>, <a href="https://arxiv.org/html/2502.12094v1" title="View HTML" id="html-2502.12094" aria-labelledby="html-2502.12094" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12094" title="Other formats" id="oth-2502.12094" aria-labelledby="oth-2502.12094">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Study on Leveraging Search and Self-Feedback for Agent Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=K,+K">Karthikeyan K</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+M">Michelle Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mansimov,+E">Elman Mansimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Margatina,+K">Katerina Margatina</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pratik,+A">Anurag Pratik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bonadiman,+D">Daniele Bonadiman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sunkara,+M">Monica Sunkara</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Benajiba,+Y">Yassine Benajiba</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Recent works have demonstrated that incorporating search during inference can significantly improve reasoning 
capabilities of language agents. Some approaches may make use of the ground truth or rely on model's own generated feedback. The search algorithm uses this feedback to then produce values that will update its criterion for exploring and exploiting various reasoning paths. In this study, we investigate how search and model's self-feedback can be leveraged for reasoning tasks. First, we explore differences in ground-truth feedback and self-feedback during search for math reasoning. Second, we observe limitations in applying search techniques to more complex tasks like tool-calling and design domain-specific approaches to address these gaps. Our experiments reveal challenges related to generalization when solely relying on self-feedback during search. For search to work effectively, either access to the ground-truth is needed or feedback mechanisms need to be carefully designed for the specific task. </p> </div> </dd> <dt> <a name='item272'>[272]</a> <a href ="/abs/2502.12118" title="Abstract" id="2502.12118"> arXiv:2502.12118 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12118" title="Download PDF" id="pdf-2502.12118" aria-labelledby="pdf-2502.12118">pdf</a>, <a href="https://arxiv.org/html/2502.12118v1" title="View HTML" id="html-2502.12118" aria-labelledby="html-2502.12118" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12118" title="Other formats" id="oth-2502.12118" aria-labelledby="oth-2502.12118">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scaling Test-Time Compute Without Verification or RL is Suboptimal </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Setlur,+A">Amrith Setlur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rajaraman,+N">Nived Rajaraman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levine,+S">Sergey Levine</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+A">Aviral Kumar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Despite substantial advances in scaling test-time compute, an ongoing debate in the community is how it should be scaled up to enable continued and efficient improvements with scaling. There are largely two approaches: first, distilling successful search or thinking traces; and second, using verification (e.g., 0/1 outcome rewards, reward models, or verifiers) to guide reinforcement learning (RL) and search algorithms. In this paper, we prove that finetuning LLMs with verifier-based (VB) methods based on RL or search is far superior to verifier-free (VF) approaches based on distilling or cloning search traces, given a fixed amount of compute/data budget. Further, we show that as we scale test-time compute (measured as the output token length) and training data, suboptimality of VF methods scales poorly compared to VB when the base pre-trained LLM presents a heterogeneous distribution over correct solution traces (e.g., different lengths, styles, etc.) and admits a non-sharp distribution over rewards on traces sampled from it. We formalize this condition using anti-concentration [Erd艖s, 1945]. This implies a stronger result that VB methods scale better asymptotically, with the performance gap between VB and VF methods widening as test-time budget grows. We corroborate our theory empirically on both didactic and math reasoning problems with 3/8/32B-sized pre-trained LLMs, where we find verification is crucial for scaling test-time compute. 
</p> </div> </dd> <dt> <a name='item273'>[273]</a> <a href ="/abs/2502.12119" title="Abstract" id="2502.12119"> arXiv:2502.12119 </a> (cross-list from cs.CV) [<a href="/pdf/2502.12119" title="Download PDF" id="pdf-2502.12119" aria-labelledby="pdf-2502.12119">pdf</a>, <a href="https://arxiv.org/html/2502.12119v1" title="View HTML" id="html-2502.12119" aria-labelledby="html-2502.12119" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12119" title="Other formats" id="oth-2502.12119" aria-labelledby="oth-2502.12119">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PRISM: Self-Pruning Intrinsic Selection Method for Training-Free Multimodal Data Selection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bi,+J">Jinhe Bi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yifan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+D">Danqi Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+X">Xun Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hecker,+A">Artur Hecker</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tresp,+V">Volker Tresp</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yunpu Ma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Visual instruction tuning refines pre-trained Multimodal Large Language Models (MLLMs) to enhance their real-world task performance. However, the rapid expansion of visual instruction datasets introduces significant data redundancy, leading to excessive computational costs. 
Existing data selection methods predominantly rely on proxy models or loss-based metrics, both of which impose substantial computational overheads due to the necessity of model inference and backpropagation. To address this challenge, we propose PRISM, a novel training-free approach for efficient multimodal data selection. Unlike existing methods, PRISM eliminates the reliance on proxy models, warm-up pretraining, and gradient-based optimization. Instead, it leverages Pearson correlation analysis to quantify the intrinsic visual encoding properties of MLLMs, computing a task-specific correlation score to identify high-value instances. This not only enbles data-efficient selection,but maintains the original performance. Empirical evaluations across multiple MLLMs demonstrate that PRISM reduces the overall time required for visual instruction tuning and data selection to just 30% of conventional methods, while surpassing fully fine-tuned models across eight multimodal and three language understanding benchmarks, achieving a 101.7% relative improvement in final performance. 
</p> </div> </dd> <dt> <a name='item274'>[274]</a> <a href ="/abs/2502.12120" title="Abstract" id="2502.12120"> arXiv:2502.12120 </a> (cross-list from cs.LG) [<a href="/pdf/2502.12120" title="Download PDF" id="pdf-2502.12120" aria-labelledby="pdf-2502.12120">pdf</a>, <a href="https://arxiv.org/html/2502.12120v1" title="View HTML" id="html-2502.12120" aria-labelledby="html-2502.12120" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12120" title="Other formats" id="oth-2502.12120" aria-labelledby="oth-2502.12120">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs on the Line: Data Determines Loss-to-Loss Scaling Laws </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mayilvahanan,+P">Prasanna Mayilvahanan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wiedemer,+T">Thadd盲us Wiedemer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mallick,+S">Sayak Mallick</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bethge,+M">Matthias Bethge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brendel,+W">Wieland Brendel</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Scaling laws guide the development of large language models (LLMs) by offering estimates for the optimal balance of model size, tokens, and compute. More recently, loss-to-loss scaling laws that relate losses across pretraining datasets and downstream tasks have emerged as a powerful tool for understanding and improving LLM performance. In this work, we investigate which factors most strongly influence loss-to-loss scaling. Our experiments reveal that the pretraining data and tokenizer determine the scaling trend. 
In contrast, model size, optimization hyperparameters, and even significant architectural differences, such as between transformer-based models like Llama and state-space models like Mamba, have limited impact. Consequently, practitioners should carefully curate suitable pretraining datasets for optimal downstream performance, while architectures and other settings can be freely optimized for training efficiency. </p> </div> </dd> <dt> <a name='item275'>[275]</a> <a href ="/abs/2502.12149" title="Abstract" id="2502.12149"> arXiv:2502.12149 </a> (cross-list from cs.MA) [<a href="/pdf/2502.12149" title="Download PDF" id="pdf-2502.12149" aria-labelledby="pdf-2502.12149">pdf</a>, <a href="https://arxiv.org/html/2502.12149v1" title="View HTML" id="html-2502.12149" aria-labelledby="html-2502.12149" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12149" title="Other formats" id="oth-2502.12149" aria-labelledby="oth-2502.12149">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HARBOR: Exploring Persona Dynamics in Multi-Agent Competition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+K">Kenan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+L">Li Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fei Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Multiagent Systems (cs.MA)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> We investigate factors contributing to LLM agents' success in competitive multi-agent environments, using auctions as a testbed where agents bid to maximize profit. The agents are equipped with bidding domain knowledge, distinct personas that reflect item preferences, and a memory of auction history. 
Our work extends the classic auction scenario by creating a realistic environment where multiple agents bid on houses, weighing aspects such as size, location, and budget to secure the most desirable homes at the lowest prices. Particularly, we investigate three key questions: (a) How does a persona influence an agent's behavior in a competitive setting? (b) Can an agent effectively profile its competitors' behavior during auctions? (c) How can persona profiling be leveraged to create an advantage using strategies such as theory of mind? Through a series of experiments, we analyze the behaviors of LLM agents and shed light on new findings. Our testbed, called HARBOR, offers a valuable platform for deepening our understanding of multi-agent workflows in competitive environments. </p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 218 of 218 entries)</h3> <dt> <a name='item276'>[276]</a> <a href ="/abs/1708.09151" title="Abstract" id="1708.09151"> arXiv:1708.09151 </a> (replaced) [<a href="/pdf/1708.09151" title="Download PDF" id="pdf-1708.09151" aria-labelledby="pdf-1708.09151">pdf</a>, <a href="https://arxiv.org/html/1708.09151v3" title="View HTML" id="html-1708.09151" aria-labelledby="html-1708.09151" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/1708.09151" title="Other formats" id="oth-1708.09151" aria-labelledby="oth-1708.09151">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Paradigm Completion for Derivational Morphology </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cotterell,+R">Ryan Cotterell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vylomova,+E">Ekaterina Vylomova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khayrallah,+H">Huda Khayrallah</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kirov,+C">Christo Kirov</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yarowsky,+D">David Yarowsky</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> EMNLP 2017 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The generation of complex derived word forms has been an overlooked problem in NLP; we fill this gap by applying neural sequence-to-sequence models to the task. We overview the theoretical motivation for a paradigmatic treatment of derivational morphology, and introduce the task of derivational paradigm completion as a parallel to inflectional paradigm completion. State-of-the-art neural models, adapted from the inflection task, are able to learn a range of derivation patterns, and outperform a non-neural baseline by 16.4%. However, due to semantic, historical, and lexical considerations involved in derivational morphology, future work will be needed to achieve performance parity with inflection-generating systems. 
</p> </div> </dd> <dt> <a name='item277'>[277]</a> <a href ="/abs/2104.08620" title="Abstract" id="2104.08620"> arXiv:2104.08620 </a> (replaced) [<a href="/pdf/2104.08620" title="Download PDF" id="pdf-2104.08620" aria-labelledby="pdf-2104.08620">pdf</a>, <a href="https://arxiv.org/html/2104.08620v4" title="View HTML" id="html-2104.08620" aria-labelledby="html-2104.08620" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2104.08620" title="Other formats" id="oth-2104.08620" aria-labelledby="oth-2104.08620">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decrypting Cryptic Crosswords: Semantically Complex Wordplay Puzzles as a Target for NLP </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rozner,+J">Josh Rozner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Potts,+C">Christopher Potts</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahowald,+K">Kyle Mahowald</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Cryptic crosswords, the dominant crossword variety in the UK, are a promising target for advancing NLP systems that seek to process semantically complex, highly compositional language. Cryptic clues read like fluent natural language but are adversarially composed of two parts: a definition and a wordplay cipher requiring character-level manipulations. Expert humans use creative intelligence to solve cryptics, flexibly combining linguistic, world, and domain knowledge. In this paper, we make two main contributions. First, we present a dataset of cryptic clues as a challenging new benchmark for NLP systems that seek to process compositional language in more creative, human-like ways. 
After showing that three non-neural approaches and T5, a state-of-the-art neural language model, do not achieve good performance, we make our second main contribution: a novel curriculum approach, in which the model is first fine-tuned on related tasks such as unscrambling <a href="http://words.We" rel="external noopener nofollow" class="link-external link-http">this http URL</a> also introduce a challenging data split, examine the meta-linguistic capabilities of subword-tokenized models, and investigate model systematicity by perturbing the wordplay part of clues, showing that T5 exhibits behavior partially consistent with human solving strategies. Although our curricular approach considerably improves on the T5 baseline, our best-performing model still fails to generalize to the extent that humans can. Thus, cryptic crosswords remain an unsolved challenge for NLP systems and a potential source of future innovation. </p> </div> </dd> <dt> <a name='item278'>[278]</a> <a href ="/abs/2309.06089" title="Abstract" id="2309.06089"> arXiv:2309.06089 </a> (replaced) [<a href="/pdf/2309.06089" title="Download PDF" id="pdf-2309.06089" aria-labelledby="pdf-2309.06089">pdf</a>, <a href="https://arxiv.org/html/2309.06089v3" title="View HTML" id="html-2309.06089" aria-labelledby="html-2309.06089" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2309.06089" title="Other formats" id="oth-2309.06089" aria-labelledby="oth-2309.06089">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Measuring Catastrophic Forgetting in Cross-Lingual Transfer Paradigms: Exploring Tuning Strategies </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koloski,+B">Boshko Koloski</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=%C5%A0krlj,+B">Bla啪 艩krlj</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Robnik-%C5%A0ikonja,+M">Marko 
Robnik-Šikonja</a>
</p> </div> </dd> <dt> <a name='item279'>[279]</a> <a href ="/abs/2310.00833" title="Abstract" id="2310.00833"> arXiv:2310.00833 </a> (replaced) [<a href="/pdf/2310.00833" title="Download PDF" id="pdf-2310.00833" aria-labelledby="pdf-2310.00833">pdf</a>, <a href="https://arxiv.org/html/2310.00833v2" title="View HTML" id="html-2310.00833" aria-labelledby="html-2310.00833" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.00833" title="Other formats" id="oth-2310.00833" aria-labelledby="oth-2310.00833">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Necessary and Sufficient Watermark for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Takezawa,+Y">Yuki Takezawa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sato,+R">Ryoma Sato</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+H">Han Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niwa,+K">Kenta Niwa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yamada,+M">Makoto Yamada</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> TMLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> In recent years, large language models (LLMs) have achieved remarkable performances in various NLP tasks. They can generate texts that are indistinguishable from those written by humans. Such remarkable performance of LLMs increases their risk of being used for malicious purposes, such as generating fake news articles. Therefore, it is necessary to develop methods for distinguishing texts written by LLMs from those written by humans. Watermarking is one of the most powerful methods for achieving this. 
Although existing watermarking methods have successfully detected texts generated by LLMs, they significantly degrade the quality of the generated texts. In this study, we propose the Necessary and Sufficient Watermark (NS-Watermark) for inserting watermarks into generated texts without degrading the text quality. More specifically, we derive minimum constraints required to be imposed on the generated texts to distinguish whether LLMs or humans write the texts. Then, we formulate the NS-Watermark as a constrained optimization problem and propose an efficient algorithm to solve it. Through the experiments, we demonstrate that the NS-Watermark can generate more natural texts than existing watermarking methods and distinguish more accurately between texts written by LLMs and those written by humans. Especially in machine translation tasks, the NS-Watermark can outperform the existing watermarking method by up to 30 BLEU scores. </p> </div> </dd> <dt> <a name='item280'>[280]</a> <a href ="/abs/2311.09730" title="Abstract" id="2311.09730"> arXiv:2311.09730 </a> (replaced) [<a href="/pdf/2311.09730" title="Download PDF" id="pdf-2311.09730" aria-labelledby="pdf-2311.09730">pdf</a>, <a href="https://arxiv.org/html/2311.09730v2" title="View HTML" id="html-2311.09730" aria-labelledby="html-2311.09730" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2311.09730" title="Other formats" id="oth-2311.09730" aria-labelledby="oth-2311.09730">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sociodemographic Prompting is Not Yet an Effective Approach for Simulating Subjective Judgments with LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Huaman Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pei,+J">Jiaxin Pei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+M">Minje Choi</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Jurgens,+D">David Jurgens</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY); Human-Computer Interaction (cs.HC); Machine Learning (cs.LG) </div> <p class='mathjax'> Human judgments are inherently subjective and are actively affected by personal traits such as gender and ethnicity. While Large Language Models (LLMs) are widely used to simulate human responses across diverse contexts, their ability to account for demographic differences in subjective tasks remains uncertain. In this study, leveraging the POPQUORN dataset, we evaluate nine popular LLMs on their ability to understand demographic differences in two subjective judgment tasks: politeness and offensiveness. We find that in zero-shot settings, most models' predictions for both tasks align more closely with labels from White participants than those from Asian or Black participants, while only a minor gender bias favoring women appears in the politeness task. Furthermore, sociodemographic prompting does not consistently improve and, in some cases, worsens LLMs' ability to perceive language from specific sub-populations. These findings highlight potential demographic biases in LLMs when performing subjective judgment tasks and underscore the limitations of sociodemographic prompting as a strategy to achieve pluralistic alignment. Code and data are available at: <a href="https://github.com/Jiaxin-Pei/LLM-as-Subjective-Judge" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item281'>[281]</a> <a href ="/abs/2401.17809" title="Abstract" id="2401.17809"> arXiv:2401.17809 </a> (replaced) [<a href="/pdf/2401.17809" title="Download PDF" id="pdf-2401.17809" aria-labelledby="pdf-2401.17809">pdf</a>, <a href="https://arxiv.org/html/2401.17809v4" title="View HTML" id="html-2401.17809" aria-labelledby="html-2401.17809" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2401.17809" title="Other formats" id="oth-2401.17809" aria-labelledby="oth-2401.17809">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SWEA: Updating Factual Knowledge in Large Language Models via Subject Word Embedding Altering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaopeng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shasha Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+S">Shezheng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Huijun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+B">Bin Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jie Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaodong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weimin Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> AAAI25. 
Our code is available at <a href="https://github.com/xpq-tech/SWEA" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> The general capabilities of large language models (LLMs) make them the infrastructure for various AI applications, but updating their inner knowledge requires significant resources. Recent model editing is a promising technique for efficiently updating a small amount of knowledge of LLMs and has attracted much attention. In particular, local editing methods, which directly update model parameters, are proven suitable for updating small amounts of knowledge. Local editing methods update weights by computing least squares closed-form solutions and identify edited knowledge by vector-level matching in inference, which achieve promising results. However, these methods still require a lot of time and resources to complete the computation. Moreover, vector-level matching lacks reliability, and such updates disrupt the original organization of the model's parameters. To address these issues, we propose a detachable and expandable Subject Word Embedding Altering (SWEA) framework, which finds the editing embeddings through token-level matching and adds them to the subject word embeddings in Transformer input. To get these editing embeddings, we propose optimizing then suppressing fusion method, which first optimizes learnable embedding vectors for the editing target and then suppresses the Knowledge Embedding Dimensions (KEDs) to obtain final editing embeddings. We thus propose SWEA$\oplus$OS method for editing factual knowledge in LLMs. We demonstrate the overall state-of-the-art (SOTA) performance of SWEA$\oplus$OS on the CounterFact and zsRE datasets. 
To further validate the reasoning ability of SWEA$\oplus$OS in editing knowledge, we evaluate it on the more complex RippleEdits benchmark. The results demonstrate that SWEA$\oplus$OS possesses SOTA reasoning ability. </p> </div> </dd> <dt> <a name='item282'>[282]</a> <a href ="/abs/2402.01737" title="Abstract" id="2402.01737"> arXiv:2402.01737 </a> (replaced) [<a href="/pdf/2402.01737" title="Download PDF" id="pdf-2402.01737" aria-labelledby="pdf-2402.01737">pdf</a>, <a href="https://arxiv.org/html/2402.01737v3" title="View HTML" id="html-2402.01737" aria-labelledby="html-2402.01737" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.01737" title="Other formats" id="oth-2402.01737" aria-labelledby="oth-2402.01737">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Assistive Large Language Model Agents for Socially-Aware Negotiation Dialogues </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yuncheng Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+L">Lizhen Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haffari,+G">Gholamreza Haffari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 28 pages, 3 figures, 14 tables; The paper has been published in the Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> We develop assistive agents based on Large Language Models (LLMs) that aid interlocutors in business negotiations. 
Specifically, we simulate business negotiations by letting two LLM-based agents engage in role play. A third LLM acts as a remediator agent to rewrite utterances violating norms for improving negotiation outcomes. We introduce a simple tuning-free and label-free In-Context Learning (ICL) method to identify high-quality ICL exemplars for the remediator, where we propose a novel select criteria, called value impact, to measure the quality of the negotiation outcomes. We provide rich empirical evidence to demonstrate its effectiveness in negotiations across three different negotiation topics. We have released our source code and the generated dataset at: <a href="https://github.com/tk1363704/SADAS" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item283'>[283]</a> <a href ="/abs/2402.02243" title="Abstract" id="2402.02243"> arXiv:2402.02243 </a> (replaced) [<a href="/pdf/2402.02243" title="Download PDF" id="pdf-2402.02243" aria-labelledby="pdf-2402.02243">pdf</a>, <a href="/format/2402.02243" title="Other formats" id="oth-2402.02243" aria-labelledby="oth-2402.02243">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Writ Large: LLMs, ChatGPT, Grounding, Meaning and Understanding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Harnad,+S">Stevan Harnad</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 54 pages, 29 references </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Frontiers in Artificial Intelligence 7: 1490698 (2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Neurons and Cognition (q-bio.NC) </div> <p class='mathjax'> Apart from what (little) OpenAI may be concealing from us, we all know (roughly) how 
ChatGPT works (its huge text database, its statistics, its vector representations, and their huge number of parameters, its next-word training, and so on). But none of us can say (hand on heart) that we are not surprised by what ChatGPT has proved to be able to do with these resources. This has even driven some of us to conclude that ChatGPT actually understands. It is not true that it understands. But it is also not true that we understand how it can do what it can do. I will suggest some hunches about benign biases: convergent constraints that emerge at LLM scale that may be helping ChatGPT do so much better than we would have expected. These biases are inherent in the nature of language itself, at LLM scale, and they are closely linked to what it is that ChatGPT lacks, which is direct sensorimotor grounding to connect its words to their referents and its propositions to their meanings. These convergent biases are related to (1) the parasitism of indirect verbal grounding on direct sensorimotor grounding, (2) the circularity of verbal definition, (3) the mirroring of language production and comprehension, (4) iconicity in propositions at LLM scale, (5) computational counterparts of human categorical perception in category learning by neural nets, and perhaps also (6) a conjecture by Chomsky about the laws of thought. The exposition will be in the form of a dialogue with ChatGPT-4. 
</p> </div> </dd> <dt> <a name='item284'>[284]</a> <a href ="/abs/2402.11068" title="Abstract" id="2402.11068"> arXiv:2402.11068 </a> (replaced) [<a href="/pdf/2402.11068" title="Download PDF" id="pdf-2402.11068" aria-labelledby="pdf-2402.11068">pdf</a>, <a href="https://arxiv.org/html/2402.11068v2" title="View HTML" id="html-2402.11068" aria-labelledby="html-2402.11068" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.11068" title="Other formats" id="oth-2402.11068" aria-labelledby="oth-2402.11068">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models for Causal Discovery: Current Landscape and Future Directions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+G">Guangya Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yunsheng Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuqi Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+M">Mengxuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Sheng Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Causal discovery (CD) and Large Language Models (LLMs) have emerged as transformative fields in artificial intelligence that have evolved largely independently. While CD specializes in uncovering cause-effect relationships from data, and LLMs excel at natural language processing and generation, their integration presents unique opportunities for advancing causal understanding. This survey examines how LLMs are transforming CD across three key dimensions: direct causal extraction from text, integration of domain knowledge into statistical methods, and refinement of causal structures. 
We systematically analyze approaches that leverage LLMs for CD tasks, highlighting their innovative use of metadata and natural language for causal inference. Our analysis reveals both LLMs' potential to enhance traditional CD methods and their current limitations as imperfect expert systems. We identify key research gaps, outline evaluation frameworks and benchmarks for LLM-based causal discovery, and advocate future research efforts for leveraging LLMs in causality research. As the first comprehensive examination of the synergy between LLMs and CD, this work lays the groundwork for future advances in the field. </p> </div> </dd> <dt> <a name='item285'>[285]</a> <a href ="/abs/2402.12649" title="Abstract" id="2402.12649"> arXiv:2402.12649 </a> (replaced) [<a href="/pdf/2402.12649" title="Download PDF" id="pdf-2402.12649" aria-labelledby="pdf-2402.12649">pdf</a>, <a href="https://arxiv.org/html/2402.12649v2" title="View HTML" id="html-2402.12649" aria-labelledby="html-2402.12649" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2402.12649" title="Other formats" id="oth-2402.12649" aria-labelledby="oth-2402.12649">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Bias in Language Models: Beyond Trick Tests and Toward RUTEd Evaluation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lum,+K">Kristian Lum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anthis,+J+R">Jacy Reese Anthis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Robinson,+K">Kevin Robinson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nagpal,+C">Chirag Nagpal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=D'Amour,+A">Alexander D'Amour</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Applications 
(stat.AP) </div> <p class='mathjax'> Standard benchmarks of bias and fairness in large language models (LLMs) measure the association between social attributes implied in user prompts and short LLM responses. In the commonly studied domain of gender-occupation bias, we test whether these benchmarks are robust to lengthening the LLM responses as a measure of Realistic Use and Tangible Effects (i.e., RUTEd evaluations). From the current literature, we adapt three standard bias metrics (neutrality, skew, and stereotype), and we develop analogous RUTEd evaluations from three contexts of real-world use: children's bedtime stories, user personas, and English language learning exercises. We find that standard bias metrics have no significant correlation with the more realistic bias metrics. For example, selecting the least biased model based on the standard "trick tests" coincides with selecting the least biased model as measured in more realistic use no more than random chance. We suggest that there is not yet evidence to justify standard benchmarks as reliable proxies of real-world biases, and we encourage further development of context-specific RUTEd evaluations. 
</p> </div> </dd> <dt> <a name='item286'>[286]</a> <a href ="/abs/2403.08211" title="Abstract" id="2403.08211"> arXiv:2403.08211 </a> (replaced) [<a href="/pdf/2403.08211" title="Download PDF" id="pdf-2403.08211" aria-labelledby="pdf-2403.08211">pdf</a>, <a href="https://arxiv.org/html/2403.08211v3" title="View HTML" id="html-2403.08211" aria-labelledby="html-2403.08211" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.08211" title="Other formats" id="oth-2403.08211" aria-labelledby="oth-2403.08211">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models are Contrastive Reasoners </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+L">Liang Yao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Prompting methods play a crucial role in enhancing the capabilities of pre-trained large language models (LLMs). We explore how contrastive prompting (CP) significantly improves the ability of large language models to perform complex reasoning. We demonstrate that LLMs are decent contrastive reasoners by simply adding "Let's give a correct and a wrong answer." before LLMs provide answers. Experiments on various large language models show that zero-shot contrastive prompting improves the performance of standard zero-shot prompting on a range of arithmetic, commonsense, and symbolic reasoning tasks without any hand-crafted few-shot examples, such as increasing the accuracy on GSM8K from 35.9% to 88.8% and AQUA-RAT from 41.3% to 62.2% with the state-of-the-art GPT-4 model. 
Our method not only surpasses zero-shot CoT and few-shot CoT in most arithmetic and commonsense reasoning tasks but also can seamlessly integrate with existing prompting methods, resulting in improved or comparable results when compared to state-of-the-art methods. Our code is available at <a href="https://github.com/yao8839836/cp" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item287'>[287]</a> <a href ="/abs/2403.09606" title="Abstract" id="2403.09606"> arXiv:2403.09606 </a> (replaced) [<a href="/pdf/2403.09606" title="Download PDF" id="pdf-2403.09606" aria-labelledby="pdf-2403.09606">pdf</a>, <a href="https://arxiv.org/html/2403.09606v2" title="View HTML" id="html-2403.09606" aria-labelledby="html-2403.09606" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.09606" title="Other formats" id="oth-2403.09606" aria-labelledby="oth-2403.09606">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models and Causal Inference in Collaboration: A Comprehensive Survey </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+P">Paiheng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Junda Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiaxin Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yifan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuhang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fuxiao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+T">Tianrui Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoliang Wang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McAuley,+J">Julian McAuley</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+W">Wei Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Furong Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Findings of the Association for Computational Linguistics: NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Causal inference has shown potential in enhancing the predictive accuracy, fairness, robustness, and explainability of Natural Language Processing (NLP) models by capturing causal relationships among variables. The emergence of generative Large Language Models (LLMs) has significantly impacted various NLP domains, particularly through their advanced reasoning capabilities. This survey focuses on evaluating and improving LLMs from a causal view in the following areas: understanding and improving the LLMs' reasoning capacity, addressing fairness and safety issues in LLMs, complementing LLMs with explanations, and handling multimodality. Meanwhile, LLMs' strong reasoning capacities can in turn contribute to the field of causal inference by aiding causal relationship discovery and causal effect estimations. This review explores the interplay between causal inference frameworks and LLMs from both perspectives, emphasizing their collective potential to further the development of more advanced and equitable artificial intelligence systems. 
</p> </div> </dd> <dt> <a name='item288'>[288]</a> <a href ="/abs/2403.17706" title="Abstract" id="2403.17706"> arXiv:2403.17706 </a> (replaced) [<a href="/pdf/2403.17706" title="Download PDF" id="pdf-2403.17706" aria-labelledby="pdf-2403.17706">pdf</a>, <a href="https://arxiv.org/html/2403.17706v2" title="View HTML" id="html-2403.17706" aria-labelledby="html-2403.17706" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.17706" title="Other formats" id="oth-2403.17706" aria-labelledby="oth-2403.17706">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Large Language Model Guided Topic Refinement Mechanism for Short Text Modeling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+S">Shuyu Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+R">Rui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+P">Peng Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Q">Qi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Haiping Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Extended version of paper accepted at DASFAA 2025 (16 pages, 6 figures) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Modeling topics effectively in short texts, such as tweets and news snippets, is crucial to capturing rapidly evolving social trends. Existing topic models often struggle to accurately capture the underlying semantic patterns of short texts, primarily due to the sparse nature of such data. This nature of texts leads to an unavoidable lack of co-occurrence information, which hinders the coherence and granularity of mined topics. 
This paper introduces a novel model-agnostic mechanism, termed Topic Refinement, which leverages the advanced text comprehension capabilities of Large Language Models (LLMs) for short-text topic modeling. Unlike traditional methods, this post-processing mechanism enhances the quality of topics extracted by various topic modeling methods through prompt engineering. We guide LLMs in identifying semantically intruder words within the extracted topics and suggesting coherent alternatives to replace these words. This process mimics human-like identification, evaluation, and refinement of the extracted topics. Extensive experiments on four diverse datasets demonstrate that Topic Refinement boosts the topic quality and improves the performance in topic-related text classification tasks. </p> </div> </dd> <dt> <a name='item289'>[289]</a> <a href ="/abs/2403.19318" title="Abstract" id="2403.19318"> arXiv:2403.19318 </a> (replaced) [<a href="/pdf/2403.19318" title="Download PDF" id="pdf-2403.19318" aria-labelledby="pdf-2403.19318">pdf</a>, <a href="https://arxiv.org/html/2403.19318v3" title="View HTML" id="html-2403.19318" aria-labelledby="html-2403.19318" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.19318" title="Other formats" id="oth-2403.19318" aria-labelledby="oth-2403.19318">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TableLLM: Enabling Tabular Data Manipulation by LLMs in Real Office Usage Scenarios </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaokang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+S">Sijia Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+B">Bohan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zeyao Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jing Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Guanlin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yao,+Z">Zijun Yao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+K">Kangli Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jinchang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang-Li,+D">Daniel Zhang-Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jifan Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Juanzi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+J">Jie Tang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> <a href="https://tablellm.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We introduce TableLLM, a robust large language model (LLM) with 8 billion parameters, purpose-built for proficiently handling tabular data manipulation tasks, whether they are embedded within documents or spreadsheets, catering to real-world office scenarios. We propose a distant supervision method for training, which comprises a reasoning process extension strategy, aiding in training LLMs to understand reasoning patterns more effectively as well as a cross-way validation strategy, ensuring the quality of the automatically generated data. To evaluate the performance of TableLLM, we have crafted benchmarks tailored to address both document and spreadsheet formats as well as constructed a well-organized evaluation pipeline capable of handling both scenarios. 
Thorough evaluations underscore the advantages of TableLLM when compared to various existing general-purpose and tabular data-focused LLMs. We have publicly released the model checkpoint, source code, benchmarks, and a web application for user interaction. Our codes and data are publicly available at <a href="https://github.com/TableLLM/TableLLM" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item290'>[290]</a> <a href ="/abs/2404.02115" title="Abstract" id="2404.02115"> arXiv:2404.02115 </a> (replaced) [<a href="/pdf/2404.02115" title="Download PDF" id="pdf-2404.02115" aria-labelledby="pdf-2404.02115">pdf</a>, <a href="https://arxiv.org/html/2404.02115v3" title="View HTML" id="html-2404.02115" aria-labelledby="html-2404.02115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.02115" title="Other formats" id="oth-2404.02115" aria-labelledby="oth-2404.02115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GINopic: Topic Modeling with Graph Isomorphism Network </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adhya,+S">Suman Adhya</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sanyal,+D+K">Debarshi Kumar Sanyal</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted as a long paper for NAACL 2024 main conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Topic modeling is a widely used approach for analyzing and exploring large document collections. Recent research efforts have incorporated pre-trained contextualized language models, such as BERT embeddings, into topic modeling. 
However, they often neglect the intrinsic informational value conveyed by mutual dependencies between words. In this study, we introduce GINopic, a topic modeling framework based on graph isomorphism networks to capture the correlation between words. By conducting intrinsic (quantitative as well as qualitative) and extrinsic evaluations on diverse benchmark datasets, we demonstrate the effectiveness of GINopic compared to existing topic models and highlight its potential for advancing topic modeling. </p> </div> </dd> <dt> <a name='item291'>[291]</a> <a href ="/abs/2404.05966" title="Abstract" id="2404.05966"> arXiv:2404.05966 </a> (replaced) [<a href="/pdf/2404.05966" title="Download PDF" id="pdf-2404.05966" aria-labelledby="pdf-2404.05966">pdf</a>, <a href="https://arxiv.org/html/2404.05966v2" title="View HTML" id="html-2404.05966" aria-labelledby="html-2404.05966" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.05966" title="Other formats" id="oth-2404.05966" aria-labelledby="oth-2404.05966">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> THOUGHTSCULPT: Reasoning with Intermediate Revision and Search </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chi,+Y">Yizhou Chi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+K">Kevin Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klein,+D">Dan Klein</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Findings. 
Code and data available at <a href="https://github.com/cyzus/thoughtsculpt" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> We present THOUGHTSCULPT, a general reasoning and search method for tasks with outputs that can be decomposed into components. THOUGHTSCULPT explores a search tree of potential solutions using Monte Carlo Tree Search (MCTS), building solutions one action at a time and evaluating according to any domain-specific heuristic, which in practice is often simply an LLM evaluator. Critically, our action space includes revision actions: THOUGHTSCULPT may choose to revise part of its previous output rather than continuing to build the rest of its output. Empirically, THOUGHTSCULPT outperforms state-of-the-art reasoning methods across three challenging tasks: Story Outline Improvement (up to +30% interestingness), Mini-Crosswords Solving (up to +16% word success rate), and Constrained Generation (up to +10% concept coverage). 
</p> </div> </dd> <dt> <a name='item292'>[292]</a> <a href ="/abs/2404.09077" title="Abstract" id="2404.09077"> arXiv:2404.09077 </a> (replaced) [<a href="/pdf/2404.09077" title="Download PDF" id="pdf-2404.09077" aria-labelledby="pdf-2404.09077">pdf</a>, <a href="https://arxiv.org/html/2404.09077v2" title="View HTML" id="html-2404.09077" aria-labelledby="html-2404.09077" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2404.09077" title="Other formats" id="oth-2404.09077" aria-labelledby="oth-2404.09077">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CuriousLLM: Elevating Multi-Document Question Answering with LLM-Enhanced Knowledge Graph Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zukang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zixuan Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+X">Xuan Zhu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have achieved significant success in open-domain question answering. However, they continue to face challenges such as hallucinations and knowledge cutoffs. These issues can be mitigated through in-context learning by providing LLMs with relevant context before generating answers. Recent literature proposes Knowledge Graph Prompting (KGP) which integrates knowledge graphs with an LLM-based traversal agent to substantially enhance document retrieval quality. However, KGP requires costly fine-tuning with large datasets and remains prone to hallucination. 
In this paper, we propose CuriousLLM, an enhancement that integrates a curiosity-driven reasoning mechanism into an LLM agent. This mechanism enables the agent to generate relevant follow-up questions, thereby guiding the information retrieval process more efficiently. Central to our approach is the development of the new Follow-upQA dataset, which includes questions and supporting evidence as input, with follow-up questions serving as ground truths. These follow-up questions either inquire about what is still missing to fully answer the user's query or use special tokens to signify that the retrieved evidence is sufficient. Our experiments show that CuriousLLM significantly boosts LLM performance in multi-document question answering (MD-QA), circumventing the substantial computational costs and latency from the original KGP framework. </p> </div> </dd> <dt> <a name='item293'>[293]</a> <a href ="/abs/2405.01474" title="Abstract" id="2405.01474"> arXiv:2405.01474 </a> (replaced) [<a href="/pdf/2405.01474" title="Download PDF" id="pdf-2405.01474" aria-labelledby="pdf-2405.01474">pdf</a>, <a href="https://arxiv.org/html/2405.01474v3" title="View HTML" id="html-2405.01474" aria-labelledby="html-2405.01474" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.01474" title="Other formats" id="oth-2405.01474" aria-labelledby="oth-2405.01474">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding Figurative Meaning through Explainable Visual Entailment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Saakyan,+A">Arkadiy Saakyan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kulkarni,+S">Shreyas Kulkarni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chakrabarty,+T">Tuhin Chakrabarty</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muresan,+S">Smaranda Muresan</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main Conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large Vision-Language Models (VLMs) have demonstrated strong capabilities in tasks requiring a fine-grained understanding of literal meaning in images and text, such as visual question-answering or visual entailment. However, there has been little exploration of the capabilities of these models when presented with images and captions containing figurative meaning, such as metaphors or humor. To close this gap, we propose a new task framing the figurative meaning understanding problem as an explainable visual entailment task, where the model has to predict whether the image (premise) entails a caption (hypothesis) and justify the predicted label with a textual explanation. The figurative phenomena can be present in the image, in the caption, or both. Using a human-AI collaboration approach, we build the accompanying expert-verified dataset V-FLUTE, containing 6,027 {image, caption, label, explanation} instances spanning five diverse figurative phenomena: metaphors, similes, idioms, sarcasm, and humor. Through automatic evaluation, we find that VLMs struggle to generalize from literal to figurative meaning, particularly when it is present in images. Further, we identify common types of errors in VLM reasoning (hallucination and incomplete or unsound reasoning) across classes of models via human evaluation. 
</p> </div> </dd> <dt> <a name='item294'>[294]</a> <a href ="/abs/2405.02079" title="Abstract" id="2405.02079"> arXiv:2405.02079 </a> (replaced) [<a href="/pdf/2405.02079" title="Download PDF" id="pdf-2405.02079" aria-labelledby="pdf-2405.02079">pdf</a>, <a href="https://arxiv.org/html/2405.02079v2" title="View HTML" id="html-2405.02079" aria-labelledby="html-2405.02079" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.02079" title="Other formats" id="oth-2405.02079" aria-labelledby="oth-2405.02079">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Argumentative Large Language Models for Explainable and Contestable Decision-Making </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Freedman,+G">Gabriel Freedman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dejl,+A">Adam Dejl</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gorur,+D">Deniz Gorur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+X">Xiang Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rago,+A">Antonio Rago</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toni,+F">Francesca Toni</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 18 figures, Accepted to AAAI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The profusion of knowledge encoded in large language models (LLMs) and their ability to apply this knowledge zero-shot in a range of settings makes them promising candidates for use in decision-making. However, they are currently limited by their inability to provide outputs which can be faithfully explained and effectively contested to correct mistakes. 
In this paper, we attempt to reconcile these strengths and weaknesses by introducing \emph{argumentative LLMs (ArgLLMs)}, a method for augmenting LLMs with argumentative reasoning. Concretely, ArgLLMs construct argumentation frameworks, which then serve as the basis for formal reasoning in support of decision-making. The interpretable nature of these argumentation frameworks and formal reasoning means that any decision made by ArgLLMs may be explained and contested. We evaluate ArgLLMs' performance experimentally in comparison with state-of-the-art techniques, in the context of the decision-making task of claim verification. We also define novel properties to characterise contestability and assess ArgLLMs formally in terms of these properties. </p> </div> </dd> <dt> <a name='item295'>[295]</a> <a href ="/abs/2405.05345" title="Abstract" id="2405.05345"> arXiv:2405.05345 </a> (replaced) [<a href="/pdf/2405.05345" title="Download PDF" id="pdf-2405.05345" aria-labelledby="pdf-2405.05345">pdf</a>, <a href="https://arxiv.org/html/2405.05345v2" title="View HTML" id="html-2405.05345" aria-labelledby="html-2405.05345" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.05345" title="Other formats" id="oth-2405.05345" aria-labelledby="oth-2405.05345">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> QuaLLM: An LLM-based Framework to Extract Quantitative Insights from Online Forums </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rao,+V+N">Varun Nagaraj Rao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agarwal,+E">Eesha Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dalal,+S">Samantha Dalal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Calacci,+D">Dan Calacci</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Monroy-Hern%C3%A1ndez,+A">Andrés Monroy-Hernández</a></div> 
<div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL Findings (2025), cite appropriately. Preliminary version presented at CHI LLM as Research Tools Workshop (2024) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Online discussion forums provide crucial data to understand the concerns of a wide range of real-world communities. However, the typical qualitative and quantitative methodologies used to analyze those data, such as thematic analysis and topic modeling, are infeasible to scale or require significant human effort to translate outputs to human readable forms. This study introduces QuaLLM, a novel LLM-based framework to analyze and extract quantitative insights from text data on online forums. The framework consists of a novel prompting and human evaluation methodology. We applied this framework to analyze over one million comments from two of Reddit's rideshare worker communities, marking the largest study of its type. We uncover significant worker concerns regarding AI and algorithmic platform decisions, responding to regulatory calls about worker insights. In short, our work sets a new precedent for AI-assisted quantitative data analysis to surface concerns from online forums. 
</p> </div> </dd> <dt> <a name='item296'>[296]</a> <a href ="/abs/2405.14075" title="Abstract" id="2405.14075"> arXiv:2405.14075 </a> (replaced) [<a href="/pdf/2405.14075" title="Download PDF" id="pdf-2405.14075" aria-labelledby="pdf-2405.14075">pdf</a>, <a href="https://arxiv.org/html/2405.14075v2" title="View HTML" id="html-2405.14075" aria-labelledby="html-2405.14075" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.14075" title="Other formats" id="oth-2405.14075" aria-labelledby="oth-2405.14075">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> $T^2$ of Thoughts: Temperature Tree Elicits Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+C">Chengkun Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yucheng Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Haoliang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lei Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have emerged as powerful tools in artificial intelligence, especially in complex decision-making scenarios, but their static problem-solving strategies often limit their adaptability to dynamic environments. We explore the enhancement of reasoning capabilities in LLMs through Temperature Tree ($T^2$) prompting via a heuristic algorithm, termed as $T^2$ of Thoughts ($T^2oT$). 
The primary focus is on enhancing decision-making processes by dynamically adjusting search parameters, especially temperature, to improve accuracy without increasing computational demands. We empirically validate that our hybrid $T^2oT$ approach yields enhancements in single-solution accuracy, multi-solution generation and text generation quality. Our findings suggest that while dynamic search depth adjustments based on temperature can yield mixed results, a fixed search depth, when coupled with adaptive capabilities of $T^2oT$, provides a more reliable and versatile problem-solving strategy. This work highlights the potential for future explorations in optimizing algorithmic interactions with foundational language models, particularly illustrated by our development for the Game of 24 and Creative Writing tasks. </p> </div> </dd> <dt> <a name='item297'>[297]</a> <a href ="/abs/2405.16720" title="Abstract" id="2405.16720"> arXiv:2405.16720 </a> (replaced) [<a href="/pdf/2405.16720" title="Download PDF" id="pdf-2405.16720" aria-labelledby="pdf-2405.16720">pdf</a>, <a href="https://arxiv.org/html/2405.16720v3" title="View HTML" id="html-2405.16720" aria-labelledby="html-2405.16720" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.16720" title="Other formats" id="oth-2405.16720" aria-labelledby="oth-2405.16720">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Scale Knowledge Washing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+R">Ruihan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zexue He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McAuley,+J">Julian McAuley</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models show impressive abilities in memorizing world knowledge, which leads to concerns regarding memorization of private information, toxic or sensitive knowledge, and copyrighted content. We introduce the problem of Large Scale Knowledge Washing, focusing on unlearning an extensive amount of factual knowledge. Previous unlearning methods usually define the reverse loss and update the model via backpropagation, which may affect the model's fluency and reasoning ability or even destroy the model due to extensive training with the reverse loss. Existing works introduce additional data from downstream tasks to prevent the model from losing capabilities, which requires downstream task awareness. Controlling the tradeoff of unlearning and maintaining existing capabilities is also challenging. To this end, we propose LAW (Large Scale Washing) to update the MLP layers in decoder-only large language models to perform knowledge washing, as inspired by model editing methods and based on the hypothesis that knowledge and reasoning are disentanglable. We derive a new objective with the knowledge to be unlearned to update the weights of certain MLP layers. Experimental results demonstrate the effectiveness of LAW in forgetting target knowledge while maintaining reasoning ability. The code will be open-sourced at <a href="https://github.com/wangyu-ustc/LargeScaleWashing" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item298'>[298]</a> <a href ="/abs/2405.19799" title="Abstract" id="2405.19799"> arXiv:2405.19799 </a> (replaced) [<a href="/pdf/2405.19799" title="Download PDF" id="pdf-2405.19799" aria-labelledby="pdf-2405.19799">pdf</a>, <a href="https://arxiv.org/html/2405.19799v3" title="View HTML" id="html-2405.19799" aria-labelledby="html-2405.19799" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.19799" title="Other formats" id="oth-2405.19799" aria-labelledby="oth-2405.19799">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unsupervised Mutual Learning of Discourse Parsing and Topic Segmentation in Dialogue </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jiahui Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+F">Feng Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=D'Haro,+L+F">Luis Fernando D'Haro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haizhou Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In dialogue systems, discourse plays a crucial role in managing conversational focus and coordinating interactions. It consists of two key structures: rhetorical structure and topic structure. The former captures the logical flow of conversations, while the latter detects transitions between topics. Together, they improve the ability of a dialogue system to track conversation dynamics and generate contextually relevant high-quality responses. These structures are typically identified through discourse parsing and topic segmentation, respectively. 
However, existing supervised methods rely on costly manual annotations, while unsupervised methods often focus on a single task, overlooking the deep linguistic interplay between rhetorical and topic structures. To address these issues, we first introduce a unified representation that integrates rhetorical and topic structures, ensuring semantic consistency between them. Under the unified representation, we further propose two linguistically grounded hypotheses based on discourse theories: (1) Local Discourse Coupling, where rhetorical cues dynamically enhance topic-aware information flow, and (2) Global Topology Constraint, where topic structure patterns probabilistically constrain rhetorical relation distributions. Building on the unified representation and two hypotheses, we propose an unsupervised mutual learning framework (UMLF) that jointly models rhetorical and topic structures, allowing them to mutually reinforce each other without requiring additional annotations. We evaluate our approach on two rhetorical datasets and three topic segmentation datasets. Experimental results demonstrate that our method surpasses all strong baselines built on pre-trained language models. Furthermore, when applied to LLMs, our framework achieves notable improvements, demonstrating its effectiveness in improving discourse structure modeling. 
</p> </div> </dd> <dt> <a name='item299'>[299]</a> <a href ="/abs/2405.20582" title="Abstract" id="2405.20582"> arXiv:2405.20582 </a> (replaced) [<a href="/pdf/2405.20582" title="Download PDF" id="pdf-2405.20582" aria-labelledby="pdf-2405.20582">pdf</a>, <a href="/format/2405.20582" title="Other formats" id="oth-2405.20582" aria-labelledby="oth-2405.20582">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Point of View of a Sentiment: Towards Clinician Bias Detection in Psychiatric Notes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Valentine,+A+A">Alissa A. Valentine</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lepow,+L+A">Lauren A. Lepow</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chan,+L">Lili Chan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Charney,+A+W">Alexander W. Charney</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Landi,+I">Isotta Landi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Oral presentation at NAACL 2024 Queer in AI Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Negative patient descriptions and stigmatizing language can contribute to generating healthcare disparities in two ways: (1) read by patients, they can harm their trust and engagement with the medical center; (2) read by physicians, they may negatively influence their perspective of a future patient. In psychiatry, the patient-clinician therapeutic alliance is a major determinant of clinical outcomes. Therefore, language usage in psychiatric clinical notes may not only create healthcare disparities, but also perpetuate them. 
Recent advances in NLP systems have facilitated the efforts to detect discriminatory language in healthcare. However, such attempts have only focused on the perspectives of the medical center and its physicians. Considering both physicians and non-physicians' point of view is a more translatable approach to identifying potentially harmful language in clinical notes. By leveraging pre-trained and large language models (PLMs and LLMs), this work aims to characterize potentially harmful language usage in psychiatric notes by identifying the sentiment expressed in sentences describing patients based on the reader's point of view. Extracting 39 sentences from the Mount Sinai Health System containing psychiatric lexicon, we fine-tuned three PLMs (RoBERTa, GatorTron, and GatorTron + Task Adaptation) and implemented zero-shot and few-shot ICL approaches for three LLMs (GPT-3.5, Llama-3.1, and Mistral) to classify the sentiment of the sentences according to the physician or non-physician point of view. Results showed that GPT-3.5 aligned best to physician point of view and Mistral aligned best to non-physician point of view. These results underline the importance of recognizing the reader's point of view, not only for improving the note writing process, but also for the quantification, identification, and reduction of bias in computational systems for downstream analyses. 
</p> </div> </dd> <dt> <a name='item300'>[300]</a> <a href ="/abs/2406.06326" title="Abstract" id="2406.06326"> arXiv:2406.06326 </a> (replaced) [<a href="/pdf/2406.06326" title="Download PDF" id="pdf-2406.06326" aria-labelledby="pdf-2406.06326">pdf</a>, <a href="https://arxiv.org/html/2406.06326v4" title="View HTML" id="html-2406.06326" aria-labelledby="html-2406.06326" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.06326" title="Other formats" id="oth-2406.06326" aria-labelledby="oth-2406.06326">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-Tuning: Instructing LLMs to Effectively Acquire New Knowledge through Self-Teaching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xiaoying Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+B">Baolin Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Ye Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jingyan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yipeng Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+H">Haitao Mi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+H">Helen Meng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 35 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) often struggle to provide up-to-date information due to their one-time training and the constantly evolving nature of the world. To keep LLMs current, existing approaches typically involve continued pre-training on new documents. However, they frequently face difficulties in extracting stored knowledge. 
Motivated by the remarkable success of the Feynman Technique in efficient human learning, we introduce Self-Tuning, a learning framework aimed at improving an LLM's ability to effectively acquire new knowledge from unseen raw documents through self-teaching. Specifically, we develop a Self-Teaching strategy that augments the documents with a set of knowledge-intensive tasks created in a self-supervised manner, focusing on three crucial aspects: memorization, comprehension, and self-reflection. Additionally, we introduce three Wiki-Newpages-2023-QA datasets to facilitate an in-depth analysis of an LLM's knowledge acquisition ability concerning memorization, extraction, and reasoning. Extensive experimental results on various models, e.g., Llama2-7B reveal that Self-Tuning consistently exhibits superior performance across all knowledge acquisition tasks and excels in preserving previous knowledge. </p> </div> </dd> <dt> <a name='item301'>[301]</a> <a href ="/abs/2406.09325" title="Abstract" id="2406.09325"> arXiv:2406.09325 </a> (replaced) [<a href="/pdf/2406.09325" title="Download PDF" id="pdf-2406.09325" aria-labelledby="pdf-2406.09325">pdf</a>, <a href="https://arxiv.org/html/2406.09325v3" title="View HTML" id="html-2406.09325" aria-labelledby="html-2406.09325" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.09325" title="Other formats" id="oth-2406.09325" aria-labelledby="oth-2406.09325">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> REVS: Unlearning Sensitive Information in Language Models via Rank Editing in the Vocabulary Space </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ashuach,+T">Tomer Ashuach</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tutek,+M">Martin Tutek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belinkov,+Y">Yonatan Belinkov</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> 18 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Language models (LMs) risk inadvertently memorizing and divulging sensitive or personally identifiable information (PII) seen in training data, causing privacy concerns. Current approaches to address this issue involve costly dataset scrubbing, or model filtering through unlearning and model editing, which can be bypassed through extraction attacks. We propose REVS, a novel non-gradient-based method for unlearning sensitive information from LMs. REVS identifies and modifies a small subset of neurons relevant for constituent tokens that form sensitive information. To adequately evaluate our method on truly sensitive information, we curate three datasets: email and URL datasets naturally memorized by the models, and a synthetic social security number dataset that we tune the models to memorize. Compared to other methods, REVS demonstrates superior performance in unlearning sensitive information and robustness to extraction attacks, while retaining underlying model integrity. 
</p> </div> </dd> <dt> <a name='item302'>[302]</a> <a href ="/abs/2406.10400" title="Abstract" id="2406.10400"> arXiv:2406.10400 </a> (replaced) [<a href="/pdf/2406.10400" title="Download PDF" id="pdf-2406.10400" aria-labelledby="pdf-2406.10400">pdf</a>, <a href="https://arxiv.org/html/2406.10400v2" title="View HTML" id="html-2406.10400" aria-labelledby="html-2406.10400" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.10400" title="Other formats" id="oth-2406.10400" aria-labelledby="oth-2406.10400">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-Reflection Makes Large Language Models Safer, Less Biased, and Ideologically Neutral </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+F">Fengyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=AlDahoul,+N">Nouar AlDahoul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Eady,+G">Gregory Eady</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zaki,+Y">Yasir Zaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rahwan,+T">Talal Rahwan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Previous studies proposed that the reasoning capabilities of large language models (LLMs) can be improved through self-reflection, i.e., letting LLMs reflect on their own output to identify and correct mistakes in the initial responses. However, earlier experiments offer mixed results when it comes to the benefits of self-reflection. Furthermore, prior studies on self-reflection are predominantly concerned with the reasoning capabilities of models, ignoring the potential for self-reflection in safety, bias, and ideological leaning. 
Here, by conducting a series of experiments testing LLM's self-reflection capability in various tasks using a variety of prompts and different LLMs, we make several contributions to the literature. First, we reconcile conflicting findings regarding the benefit of self-reflection, by demonstrating that the outcome of self-reflection is sensitive to prompt wording -- both the original prompt that is used to elicit an initial answer and the subsequent prompt used to self-reflect. Specifically, although self-reflection may improve the reasoning capability of LLMs when the initial response is simple, the technique cannot improve upon the state-of-the-art chain-of-thought (CoT) prompting. Second, we show that self-reflection can lead to safer (75.8\% reduction in toxic responses while preserving 97.8\% non-toxic ones), less biased (77\% reduction in gender biased responses, while preserving 94.3\% unbiased ones), and more ideologically neutral responses (100\% reduction in partisan leaning response, while preserving 87.7\% non-partisan ones). The paper concludes by discussing the implications of our findings on the deployment of large language models. We release our experiments at <a href="https://github.com/Michael98Liu/self-reflection" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item303'>[303]</a> <a href ="/abs/2406.11139" title="Abstract" id="2406.11139"> arXiv:2406.11139 </a> (replaced) [<a href="/pdf/2406.11139" title="Download PDF" id="pdf-2406.11139" aria-labelledby="pdf-2406.11139">pdf</a>, <a href="https://arxiv.org/html/2406.11139v3" title="View HTML" id="html-2406.11139" aria-labelledby="html-2406.11139" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11139" title="Other formats" id="oth-2406.11139" aria-labelledby="oth-2406.11139">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Breaking Boundaries: Investigating the Effects of Model Editing on Cross-linguistic Performance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Banerjee,+S">Somnath Banerjee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Halder,+A">Avik Halder</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mandal,+R">Rajarshi Mandal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Layek,+S">Sayan Layek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soboroff,+I">Ian Soboroff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hazra,+R">Rima Hazra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukherjee,+A">Animesh Mukherjee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at NAACL 2025 (Industry track) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The integration of pretrained language models (PLMs) like BERT and GPT has revolutionized NLP, particularly for English, but it has also created linguistic imbalances. 
This paper strategically identifies the need for linguistic equity by examining several knowledge editing techniques in multilingual contexts. We evaluate the performance of models such as Mistral, TowerInstruct, OpenHathi, Tamil-Llama, and Kan-Llama across languages including English, German, French, Italian, Spanish, Hindi, Tamil, and Kannada. Our research identifies significant discrepancies in normal and merged models concerning cross-lingual consistency. We employ strategies like 'each language for itself' (ELFI) and 'each language for others' (ELFO) to stress-test these models. Our findings demonstrate the potential for LLMs to overcome linguistic barriers, laying the groundwork for future research in achieving linguistic inclusivity in AI technologies. </p> </div> </dd> <dt> <a name='item304'>[304]</a> <a href ="/abs/2406.11288" title="Abstract" id="2406.11288"> arXiv:2406.11288 </a> (replaced) [<a href="/pdf/2406.11288" title="Download PDF" id="pdf-2406.11288" aria-labelledby="pdf-2406.11288">pdf</a>, <a href="https://arxiv.org/html/2406.11288v3" title="View HTML" id="html-2406.11288" aria-labelledby="html-2406.11288" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11288" title="Other formats" id="oth-2406.11288" aria-labelledby="oth-2406.11288">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MFC-Bench: Benchmarking Multimodal Fact-Checking with Large Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shengkang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongzhan Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Z">Ziyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+Z">Zhen Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Guang Chen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jing Ma</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 28 pages, 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Large vision-language models (LVLMs) have significantly improved multimodal reasoning tasks, such as visual question answering and image captioning. These models embed multimodal facts within their parameters, rather than relying on external knowledge bases to store factual information explicitly. However, the content discerned by LVLMs may deviate from factuality due to inherent bias or incorrect inference. To address this issue, we introduce MFC-Bench, a rigorous and comprehensive benchmark designed to evaluate the factual accuracy of LVLMs across three stages of verdict prediction for MFC: Manipulation, Out-of-Context, and Veracity Classification. Through our evaluation on MFC-Bench, we benchmarked a dozen diverse and representative LVLMs, uncovering that current models still fall short in multimodal fact-checking and demonstrate insensitivity to various forms of manipulated content. We hope that MFC-Bench could raise attention to the trustworthy AI potentially assisted by LVLMs in the future. The MFC-Bench and accompanying resources are publicly accessible at <a href="https://github.com/wskbest/MFC-Bench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>, contributing to ongoing research in the multimodal fact-checking field. 
</p> </div> </dd> <dt> <a name='item305'>[305]</a> <a href ="/abs/2406.11632" title="Abstract" id="2406.11632"> arXiv:2406.11632 </a> (replaced) [<a href="/pdf/2406.11632" title="Download PDF" id="pdf-2406.11632" aria-labelledby="pdf-2406.11632">pdf</a>, <a href="https://arxiv.org/html/2406.11632v4" title="View HTML" id="html-2406.11632" aria-labelledby="html-2406.11632" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11632" title="Other formats" id="oth-2406.11632" aria-labelledby="oth-2406.11632">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unveiling the Power of Source: Source-based Minimum Bayes Risk Decoding for Neural Machine Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+B">Boxuan Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kamigaito,+H">Hidetaka Kamigaito</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Funakoshi,+K">Kotaro Funakoshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Okumura,+M">Manabu Okumura</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Maximum a posteriori decoding, a commonly used method for neural machine translation (NMT), aims to maximize the estimated posterior probability. However, high estimated probability does not always lead to high translation quality. Minimum Bayes Risk (MBR) decoding (\citealp{kumar2004minimum}) offers an alternative by seeking hypotheses with the highest expected utility. 
<br>Inspired by Quality Estimation (QE) reranking which uses the QE model as a ranker (\citealp{fernandes-etal-2022-quality}), we propose source-based MBR (sMBR) decoding, a novel approach that utilizes quasi-sources (generated via paraphrasing or back-translation) as ``support hypotheses'' and a reference-free quality estimation metric as the utility function, marking the first work to solely use sources in MBR decoding. Experiments show that sMBR outperforms QE reranking and the standard MBR decoding. Our findings suggest that sMBR is a promising approach for NMT decoding. </p> </div> </dd> <dt> <a name='item306'>[306]</a> <a href ="/abs/2406.11785" title="Abstract" id="2406.11785"> arXiv:2406.11785 </a> (replaced) [<a href="/pdf/2406.11785" title="Download PDF" id="pdf-2406.11785" aria-labelledby="pdf-2406.11785">pdf</a>, <a href="https://arxiv.org/html/2406.11785v3" title="View HTML" id="html-2406.11785" aria-labelledby="html-2406.11785" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11785" title="Other formats" id="oth-2406.11785" aria-labelledby="oth-2406.11785">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CELL your Model: Contrastive Explanations for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luss,+R">Ronny Luss</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miehling,+E">Erik Miehling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dhurandhar,+A">Amit Dhurandhar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> The advent of black-box deep neural network classification models has sparked the need to explain their decisions. 
However, in the case of generative AI, such as large language models (LLMs), there is no class prediction to explain. Rather, one can ask why an LLM output a particular response to a given prompt. In this paper, we answer this question by proposing a contrastive explanation method requiring simply black-box/query access. Our explanations suggest that an LLM outputs a reply to a given prompt because if the prompt was slightly modified, the LLM would have given a different response that is either less preferable or contradicts the original response. The key insight is that contrastive explanations simply require a scoring function that has meaning to the user and not necessarily a specific real valued quantity (viz. class label). To this end, we offer a novel budgeted algorithm, our main algorithmic contribution, which intelligently creates contrasts based on such a scoring function while adhering to a query budget, necessary for longer contexts. We show the efficacy of our method on important natural language tasks such as open-text generation and chatbot conversations. 
</p> </div> </dd> <dt> <a name='item307'>[307]</a> <a href ="/abs/2406.12221" title="Abstract" id="2406.12221"> arXiv:2406.12221 </a> (replaced) [<a href="/pdf/2406.12221" title="Download PDF" id="pdf-2406.12221" aria-labelledby="pdf-2406.12221">pdf</a>, <a href="https://arxiv.org/html/2406.12221v2" title="View HTML" id="html-2406.12221" aria-labelledby="html-2406.12221" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.12221" title="Other formats" id="oth-2406.12221" aria-labelledby="oth-2406.12221">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On-Policy Self-Alignment with Fine-grained Knowledge Feedback for Hallucination Mitigation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+X">Xueru Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xinyu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+X">Xinyan Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yaojie Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongyu Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Ben He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xianpei Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Le Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Hallucination occurs when large language models exhibit behavior that deviates from the boundaries of their knowledge during response generation. To address this critical issue, previous learning-based methods attempt to finetune models but are limited by off-policy sampling and coarse-grained feedback. 
In this paper, we present \textit{\b{R}einforcement \b{L}earning \b{f}or \b{H}allucination} (RLFH), an on-policy self-alignment approach that enables LLMs to actively explore their knowledge boundaries and self-correct generation behavior through fine-grained feedback signals. RLFH introduces a self-assessment framework where the policy serves as its own judge. Through this framework, responses are automatically decomposed into atomic facts and their truthfulness and informativeness are assessed against external knowledge sources. The resulting fine-grained feedback at the statement level is then converted into token-level dense reward signals. This enables online reinforcement learning to achieve precise and timely optimization without human intervention. Comprehensive evaluations on HotpotQA, SQuADv2, and Biography benchmarks validate RLFH's effectiveness in hallucination mitigation. </p> </div> </dd> <dt> <a name='item308'>[308]</a> <a href ="/abs/2406.13144" title="Abstract" id="2406.13144"> arXiv:2406.13144 </a> (replaced) [<a href="/pdf/2406.13144" title="Download PDF" id="pdf-2406.13144" aria-labelledby="pdf-2406.13144">pdf</a>, <a href="https://arxiv.org/html/2406.13144v5" title="View HTML" id="html-2406.13144" aria-labelledby="html-2406.13144" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.13144" title="Other formats" id="oth-2406.13144" aria-labelledby="oth-2406.13144">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DialSim: A Real-Time Simulator for Evaluating Long-Term Multi-Party Dialogue Understanding of Conversation Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jiho Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chay,+W">Woosog Chay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+H">Hyeonji Hwang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kyung,+D">Daeun Kyung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chung,+H">Hyunseung Chung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+E">Eunbyeol Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jo,+Y">Yohan Jo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+E">Edward Choi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent advancements in Large Language Models (LLMs) have significantly enhanced the capabilities of conversation systems, making them applicable to various fields (e.g., education). Despite their progress, the evaluation of the systems often overlooks the complexities of real-world conversations, such as real-time interactions, multi-party dialogues, and extended contextual dependencies. To bridge this gap, we introduce DialSim, a real-time dialogue simulator. In this simulator, a conversation system is assigned the role of a character from popular TV shows, requiring it to respond to spontaneous questions using past dialogue information and to distinguish between known and unknown information. Key features of DialSim include assessing the system's ability to respond within a reasonable time limit, handling long-term multi-party dialogues, and evaluating performance under randomized questioning with LongDialQA, a novel, high-quality question-answering dataset. Our experiments using DialSim reveal the strengths and weaknesses of the latest conversation systems, offering valuable insights for future advancements in conversational AI. DialSim is available at <a href="https://dialsim.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item309'>[309]</a> <a href ="/abs/2406.14326" title="Abstract" id="2406.14326"> arXiv:2406.14326 </a> (replaced) [<a href="/pdf/2406.14326" title="Download PDF" id="pdf-2406.14326" aria-labelledby="pdf-2406.14326">pdf</a>, <a href="/format/2406.14326" title="Other formats" id="oth-2406.14326" aria-labelledby="oth-2406.14326">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> medIKAL: Integrating Knowledge Graphs as Assistants of LLMs for Enhanced Clinical Diagnosis on EMRs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+M">Mingyi Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+J">Junwen Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jianxin Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Electronic Medical Records (EMRs), while integral to modern healthcare, present challenges for clinical reasoning and diagnosis due to their complexity and information redundancy. To address this, we proposed medIKAL (Integrating Knowledge Graphs as Assistants of LLMs), a framework that combines Large Language Models (LLMs) with knowledge graphs (KGs) to enhance diagnostic capabilities. medIKAL assigns weighted importance to entities in medical records based on their type, enabling precise localization of candidate diseases within KGs. It innovatively employs a residual network-like approach, allowing initial diagnosis by the LLM to be merged into KG search results. Through a path-based reranking algorithm and a fill-in-the-blank style prompt template, it further refined the diagnostic process. 
We validated medIKAL's effectiveness through extensive experiments on a newly introduced open-sourced Chinese EMR dataset, demonstrating its potential to improve clinical diagnosis in real-world settings. </p> </div> </dd> <dt> <a name='item310'>[310]</a> <a href ="/abs/2406.15490" title="Abstract" id="2406.15490"> arXiv:2406.15490 </a> (replaced) [<a href="/pdf/2406.15490" title="Download PDF" id="pdf-2406.15490" aria-labelledby="pdf-2406.15490">pdf</a>, <a href="https://arxiv.org/html/2406.15490v2" title="View HTML" id="html-2406.15490" aria-labelledby="html-2406.15490" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.15490" title="Other formats" id="oth-2406.15490" aria-labelledby="oth-2406.15490">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Causal Discovery Inspired Unsupervised Domain Adaptation for Emotion-Cause Pair Extraction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hua,+Y">Yuncheng Hua</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yujin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Shuo Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+T">Tao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+L">Lizhen Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bain,+C">Chris Bain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bassed,+R">Richard Bassed</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haffari,+G">Gholamreza Haffari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 6 figures, 5 tables. 
The paper has been published in the Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Findings of the Association for Computational Linguistics: EMNLP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> This paper tackles the task of emotion-cause pair extraction in the unsupervised domain adaptation setting. The problem is challenging as the distributions of the events causing emotions in target domains are dramatically different than those in source domains, although the distributions of emotional expressions between domains are overlapped. Inspired by causal discovery, we propose a novel deep latent model in the variational autoencoder (VAE) framework, which not only captures the underlying latent structures of data but also utilizes the easily transferable knowledge of emotions as the bridge to link the distributions of events in different domains. To facilitate knowledge transfer across domains, we also propose a novel variational posterior regularization technique to disentangle the latent representations of emotions from those of events in order to mitigate the damage caused by the spurious correlations related to the events in source domains. Through extensive experiments, we demonstrate that our model outperforms the strongest baseline by approximately 11.05\% on a Chinese benchmark and 2.45\% on an English benchmark in terms of weighted-average F1 score. We have released our source code and the generated dataset publicly at: <a href="https://github.com/tk1363704/CAREL-VAE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item311'>[311]</a> <a href ="/abs/2406.15938" title="Abstract" id="2406.15938"> arXiv:2406.15938 </a> (replaced) [<a href="/pdf/2406.15938" title="Download PDF" id="pdf-2406.15938" aria-labelledby="pdf-2406.15938">pdf</a>, <a href="https://arxiv.org/html/2406.15938v4" title="View HTML" id="html-2406.15938" aria-labelledby="html-2406.15938" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.15938" title="Other formats" id="oth-2406.15938" aria-labelledby="oth-2406.15938">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RuleR: Improving LLM Controllability by Rule-based Data Recycling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Han Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chenguang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nguyen,+D">Dang Nguyen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dianqi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+T">Tianyi Zhou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL2025 main, Camera-ready </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) still lack delicate controllability over their responses, which is critical to enhancing their performance and the user experience. However, curating supervised fine-tuning (SFT) datasets to improve LLM controllability usually relies on human experts or proprietary LLMs, which requires additional costs. 
To bridge this gap, we propose Rule-based Data Recycling (RuleR), a data augmentation method incorporating multiple constraints into the original data samples according to predefined rules, which creates new training tasks to consolidate the controllability of LLMs. Instead of creating new data from scratch, RuleR "recycles" existing data by simply applying rule-based edits to their responses and appending the rule-instructions in their original instructions. Experimental results demonstrate RuleR's effectiveness in improving LLM controllability while maintaining general instruction-following capabilities. </p> </div> </dd> <dt> <a name='item312'>[312]</a> <a href ="/abs/2406.15948" title="Abstract" id="2406.15948"> arXiv:2406.15948 </a> (replaced) [<a href="/pdf/2406.15948" title="Download PDF" id="pdf-2406.15948" aria-labelledby="pdf-2406.15948">pdf</a>, <a href="https://arxiv.org/html/2406.15948v3" title="View HTML" id="html-2406.15948" aria-labelledby="html-2406.15948" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.15948" title="Other formats" id="oth-2406.15948" aria-labelledby="oth-2406.15948">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Teaching LLMs to Abstain across Languages via Multilingual Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+S">Shangbin Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+W">Weijia Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yike Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+W">Wenxuan Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ahia,+O">Orevaoghene Ahia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S+S">Shuyue Stella Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Balachandran,+V">Vidhisha Balachandran</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Sitaram,+S">Sunayana Sitaram</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsvetkov,+Y">Yulia Tsvetkov</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> EMNLP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual LLMs often have knowledge disparities across languages, with larger gaps in under-resourced languages. Teaching LLMs to abstain in the face of knowledge gaps is thus a promising strategy to mitigate hallucinations in multilingual settings. However, previous studies on LLM abstention primarily focus on English; we find that directly applying existing solutions beyond English results in up to 20.5% performance gaps between high and low-resource languages, potentially due to LLMs' drop in calibration and reasoning beyond a few resource-rich languages. To this end, we propose strategies to enhance LLM abstention by learning from multilingual feedback, where LLMs self-reflect on proposed answers in one language by generating multiple feedback items in related languages: we show that this helps identify the knowledge gaps across diverse languages, cultures, and communities. Extensive experiments demonstrate that our multilingual feedback approach outperforms various strong baselines, achieving up to 9.2% improvement for low-resource languages across three black-box and open models on three datasets, featuring open-book, closed-book, and commonsense QA. Further analysis reveals that multilingual feedback is both an effective and a more equitable abstain strategy to serve diverse language speakers, and cultural factors have great impact on language selection and LLM abstention behavior, highlighting future directions for multilingual and multi-cultural reliable language modeling. 
</p> </div> </dd> <dt> <a name='item313'>[313]</a> <a href ="/abs/2406.16288" title="Abstract" id="2406.16288"> arXiv:2406.16288 </a> (replaced) [<a href="/pdf/2406.16288" title="Download PDF" id="pdf-2406.16288" aria-labelledby="pdf-2406.16288">pdf</a>, <a href="https://arxiv.org/html/2406.16288v2" title="View HTML" id="html-2406.16288" aria-labelledby="html-2406.16288" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.16288" title="Other formats" id="oth-2406.16288" aria-labelledby="oth-2406.16288">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PlagBench: Exploring the Duality of Large Language Models in Plagiarism Generation and Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jooyoung Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agrawal,+T">Toshini Agrawal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Uchendu,+A">Adaku Uchendu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Le,+T">Thai Le</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Jinghui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+D">Dongwon Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This is a camera-ready version of NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Recent studies have raised concerns about the potential threats large language models (LLMs) pose to academic integrity and copyright protection. Yet, their investigation is predominantly focused on literal copies of original texts. Also, how LLMs can facilitate the detection of LLM-generated plagiarism remains largely unexplored. 
To address these gaps, we introduce \textbf{\sf PlagBench}, a dataset of 46.5K synthetic text pairs that represent three major types of plagiarism: verbatim copying, paraphrasing, and summarization. These samples are generated by three advanced LLMs. We rigorously validate the quality of PlagBench through a combination of fine-grained automatic evaluation and human annotation. We then utilize this dataset for two purposes: (1) to examine LLMs' ability to transform original content into accurate paraphrases and summaries, and (2) to evaluate the plagiarism detection performance of five modern LLMs alongside three specialized plagiarism checkers. Our results show that GPT-3.5 Turbo can produce high-quality paraphrases and summaries without significantly increasing text complexity compared to GPT-4 Turbo. However, in terms of detection, GPT-4 outperforms other LLMs and commercial detection tools by 20%, highlighting the evolving capabilities of LLMs not only in content generation but also in plagiarism detection. Data and source code are available at <a href="https://github.com/Brit7777/plagbench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item314'>[314]</a> <a href ="/abs/2406.17519" title="Abstract" id="2406.17519"> arXiv:2406.17519 </a> (replaced) [<a href="/pdf/2406.17519" title="Download PDF" id="pdf-2406.17519" aria-labelledby="pdf-2406.17519">pdf</a>, <a href="https://arxiv.org/html/2406.17519v2" title="View HTML" id="html-2406.17519" aria-labelledby="html-2406.17519" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.17519" title="Other formats" id="oth-2406.17519" aria-labelledby="oth-2406.17519">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Entropy-Based Decoding for Retrieval-Augmented Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+Z">Zexuan Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+Z">Zijing Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+B">Bin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jingjing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Aiwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=King,+I">Irwin King</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main Conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Augmenting Large Language Models (LLMs) with retrieved external knowledge has proven effective for improving the factual accuracy of generated responses. Despite their success, retrieval-augmented LLMs still face the distractibility issue, where the generated responses are negatively influenced by noise from both external and internal knowledge sources. 
In this paper, we introduce a novel, training-free decoding method guided by entropy considerations to mitigate this issue. Our approach utilizes entropy-based document-parallel ensemble decoding to prioritize low-entropy distributions from retrieved documents, thereby enhancing the extraction of relevant information of context. Additionally, it incorporates a contrastive decoding mechanism that contrasts the obtained low-entropy ensemble distribution with the high-entropy distribution derived from the model's internal knowledge across layers, which ensures a greater emphasis on reliable external information. Extensive experiments on open-domain question answering datasets demonstrate the superiority of our method. </p> </div> </dd> <dt> <a name='item315'>[315]</a> <a href ="/abs/2407.00924" title="Abstract" id="2407.00924"> arXiv:2407.00924 </a> (replaced) [<a href="/pdf/2407.00924" title="Download PDF" id="pdf-2407.00924" aria-labelledby="pdf-2407.00924">pdf</a>, <a href="https://arxiv.org/html/2407.00924v2" title="View HTML" id="html-2407.00924" aria-labelledby="html-2407.00924" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.00924" title="Other formats" id="oth-2407.00924" aria-labelledby="oth-2407.00924">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EXCGEC: A Benchmark for Edit-Wise Explainable Chinese Grammatical Error Correction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jingheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+S">Shang Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yinghui Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+X">Xuxin Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+L">Libo Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hai-Tao Zheng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Ying Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+P">Peng Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Z">Zishan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+G">Guo Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+W">Wenhao Jiang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to AAAI 2025. 19 pages with an appendix, 10 tables, and 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Existing studies explore the explainability of Grammatical Error Correction (GEC) in a limited scenario, where they ignore the interaction between corrections and explanations and have not established a corresponding comprehensive benchmark. To bridge the gap, this paper first introduces the task of EXplainable GEC (EXGEC), which focuses on the integral role of correction and explanation tasks. To facilitate the task, we propose EXCGEC, a tailored benchmark for Chinese EXGEC consisting of 8,216 explanation-augmented samples featuring the design of hybrid edit-wise explanations. We then benchmark several series of LLMs in multi-task learning settings, including post-explaining and pre-explaining. To promote the development of the task, we also build a comprehensive evaluation suite by leveraging existing automatic metrics and conducting human evaluation experiments to demonstrate the human consistency of the automatic metrics for free-text explanations. 
Our experiments reveal the effectiveness of evaluating free-text explanations using traditional metrics like METEOR and ROUGE, and the inferior performance of multi-task models compared to the pipeline solution, indicating the challenge of establishing positive effects when learning both tasks. </p> </div> </dd> <dt> <a name='item316'>[316]</a> <a href ="/abs/2407.02039" title="Abstract" id="2407.02039"> arXiv:2407.02039 </a> (replaced) [<a href="/pdf/2407.02039" title="Download PDF" id="pdf-2407.02039" aria-labelledby="pdf-2407.02039">pdf</a>, <a href="https://arxiv.org/html/2407.02039v2" title="View HTML" id="html-2407.02039" aria-labelledby="html-2407.02039" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.02039" title="Other formats" id="oth-2407.02039" aria-labelledby="oth-2407.02039">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Prompt Stability Scoring for Text Annotation with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Barrie,+C">Christopher Barrie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Palaiologou,+E">Elli Palaiologou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=T%C3%B6rnberg,+P">Petter Törnberg</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 39 pages, 5 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Researchers are increasingly using language models (LMs) for text annotation. These approaches rely only on a prompt telling the model to return a given output according to a set of instructions. The reproducibility of LM outputs may nonetheless be vulnerable to small changes in the prompt design. This calls into question the replicability of classification routines. 
To tackle this problem, researchers have typically tested a variety of semantically similar prompts to determine what we call ``prompt stability." These approaches remain ad-hoc and task specific. In this article, we propose a general framework for diagnosing prompt stability by adapting traditional approaches to intra- and inter-coder reliability scoring. We call the resulting metric the Prompt Stability Score (PSS) and provide a Python package \texttt{promptstability} for its estimation. Using six different datasets and twelve outcomes, we classify $\sim$3.1m rows of data and $\sim$300m input tokens to: a) diagnose when prompt stability is low; and b) demonstrate the functionality of the package. We conclude by providing best practice recommendations for applied researchers. </p> </div> </dd> <dt> <a name='item317'>[317]</a> <a href ="/abs/2407.08952" title="Abstract" id="2407.08952"> arXiv:2407.08952 </a> (replaced) [<a href="/pdf/2407.08952" title="Download PDF" id="pdf-2407.08952" aria-labelledby="pdf-2407.08952">pdf</a>, <a href="https://arxiv.org/html/2407.08952v3" title="View HTML" id="html-2407.08952" aria-labelledby="html-2407.08952" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.08952" title="Other formats" id="oth-2407.08952" aria-labelledby="oth-2407.08952">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Detect, Investigate, Judge and Determine: A Knowledge-guided Framework for Few-shot Fake News Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Ye Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Jiajun Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xukai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+H">Haoyu Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yanghai Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+K">Kai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiaofang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+E">Enhong Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Few-Shot Fake News Detection (FS-FND) aims to distinguish inaccurate news from real ones in extremely low-resource scenarios. This task has garnered increased attention due to the widespread dissemination and harmful impact of fake news on social media. Large Language Models (LLMs) have demonstrated competitive performance with the help of their rich prior knowledge and excellent in-context learning abilities. However, existing methods face significant limitations, such as the Understanding Ambiguity and Information Scarcity, which significantly undermine the potential of LLMs. To address these shortcomings, we propose a Dual-perspective Knowledge-guided Fake News Detection (DKFND) model, designed to enhance LLMs from both inside and outside perspectives. Specifically, DKFND first identifies the knowledge concepts of each news article through a Detection Module. Subsequently, DKFND creatively designs an Investigation Module to retrieve inside and outside valuable information concerning the current news, followed by another Judge Module to evaluate their relevance and confidence. Finally, a Determination Module further derives two respective predictions and obtains the final result. Extensive experiments on two public datasets show the efficacy of our proposed method, particularly in low-resource settings. 
</p> </div> </dd> <dt> <a name='item318'>[318]</a> <a href ="/abs/2407.10275" title="Abstract" id="2407.10275"> arXiv:2407.10275 </a> (replaced) [<a href="/pdf/2407.10275" title="Download PDF" id="pdf-2407.10275" aria-labelledby="pdf-2407.10275">pdf</a>, <a href="https://arxiv.org/html/2407.10275v2" title="View HTML" id="html-2407.10275" aria-labelledby="html-2407.10275" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.10275" title="Other formats" id="oth-2407.10275" aria-labelledby="oth-2407.10275">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cross-Lingual Multi-Hop Knowledge Editing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Khandelwal,+A">Aditi Khandelwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+H">Harman Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+H">Hengrui Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tianlong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+K">Kaixiong Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models are often expected to constantly adapt to new sources of knowledge and knowledge editing techniques aim to efficiently patch the outdated model knowledge, with minimal modification. Most prior works focus on monolingual knowledge editing in English, even though new information can emerge in any language from any part of the world. We propose the Cross-Lingual Multi-Hop Knowledge Editing paradigm, for measuring and analyzing the performance of various SoTA knowledge editing techniques in a cross-lingual setup. 
Specifically, we create a parallel cross-lingual benchmark, CROLIN-MQUAKE for measuring the knowledge editing capabilities. Our extensive analysis over various knowledge editing techniques uncover significant gaps in performance between the cross-lingual and English-centric setting. Following this, we propose a significantly improved system for cross-lingual multi-hop knowledge editing, CLEVER-CKE. CLEVER-CKE is based on a retrieve, verify and generate knowledge editing framework, where a retriever is formulated to recall edited facts and support an LLM to adhere to knowledge edits. We develop language-aware and hard-negative based contrastive objectives for improving the cross-lingual and fine-grained fact retrieval and verification process used in this framework. Extensive experiments on three LLMs, eight languages, and two datasets show CLEVER-CKE's significant gains of up to 30% over prior methods. </p> </div> </dd> <dt> <a name='item319'>[319]</a> <a href ="/abs/2407.11686" title="Abstract" id="2407.11686"> arXiv:2407.11686 </a> (replaced) [<a href="/pdf/2407.11686" title="Download PDF" id="pdf-2407.11686" aria-labelledby="pdf-2407.11686">pdf</a>, <a href="https://arxiv.org/html/2407.11686v4" title="View HTML" id="html-2407.11686" aria-labelledby="html-2407.11686" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.11686" title="Other formats" id="oth-2407.11686" aria-labelledby="oth-2407.11686">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CCoE: A Compact and Efficient LLM Framework with Multi-Expert Collaboration for Resource-Limited Settings </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+S">Shaomang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+J">Jianfeng Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+M">Min Peng</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+H">Hanzhong Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have achieved exceptional performance across diverse domains through training on massive datasets. However, scaling LLMs to support multiple downstream domain applications remains a significant challenge, especially under resource constraints. Existing approaches often struggle to balance performance across multiple domains with resource efficiency, limiting their broader applicability. To address this, we introduce the CCoE architecture, a modular framework that seamlessly integrates domain-specific experts into a unified LLM. By leveraging independently trained expert subnetworks on a shared backbone partition, CCoE achieves state-of-the-art performance while significantly reducing the resource requirements for multi-expert deployments. Furthermore, rule-based gating and expert planning in CCoE enable flexible task allocation, promoting expert collaboration to handle complex reasoning tasks. CCoE not only reduces inference costs but also provides a flexible and scalable solution for integrating domain expertise across diverse applications. Experiments on five domains demonstrate that CCoE achieves comparable performance to current domain-specific LLMs. Moreover, compared to existing multi-domain model ensemble methods, CCoE reduces memory usage by 61.3%, while improving inference efficiency by 0.76x over parameter-efficient multi-expert integration approaches. 
</p> </div> </dd> <dt> <a name='item320'>[320]</a> <a href ="/abs/2407.14482" title="Abstract" id="2407.14482"> arXiv:2407.14482 </a> (replaced) [<a href="/pdf/2407.14482" title="Download PDF" id="pdf-2407.14482" aria-labelledby="pdf-2407.14482">pdf</a>, <a href="https://arxiv.org/html/2407.14482v3" title="View HTML" id="html-2407.14482" aria-labelledby="html-2407.14482" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.14482" title="Other formats" id="oth-2407.14482" aria-labelledby="oth-2407.14482">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG Capabilities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+P">Peng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ping,+W">Wei Ping</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+X">Xianchao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chejian Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zihan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shoeybi,+M">Mohammad Shoeybi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Catanzaro,+B">Bryan Catanzaro</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> In this work, we introduce ChatQA 2, an Llama 3.0-based model with a 128K context window, designed to bridge the gap between open-source LLMs and leading proprietary models (e.g., GPT-4-Turbo-2024-04-09) in long context understanding and retrieval-augmented 
generation (RAG) capabilities. These two capabilities are complementary to each other and essential for LLMs to process large volumes of information that cannot fit into a single prompt. We present a detailed continued training recipe to extend the context window of Llama3-70B-base from 8K to 128K tokens, along with a three-stage instruction tuning process to enhance the model's instruction-following, RAG performance, and long-context understanding capabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model outperforms most existing state-of-the-art models, including GPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on ultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only a 4K context window, showing the strong long context capability across varying sequence lengths. We further provide extensive comparisons between direct long-context and RAG solutions using the same state-of-the-art long-context LLMs. Interestingly, we find that the performance of strong long-context LLMs using RAG improves when retrieving a larger number of chunks. With a large set of top-k chunks, RAG consistently outperforms direct long-context solution using the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B and Qwen2-72B-Instruct) on both 32K and 128K benchmarks. 
We open-source the model weights, training data, and the evaluation setup for the community: <a href="https://chatqa2-project.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item321'>[321]</a> <a href ="/abs/2407.16724" title="Abstract" id="2407.16724"> arXiv:2407.16724 </a> (replaced) [<a href="/pdf/2407.16724" title="Download PDF" id="pdf-2407.16724" aria-labelledby="pdf-2407.16724">pdf</a>, <a href="https://arxiv.org/html/2407.16724v3" title="View HTML" id="html-2407.16724" aria-labelledby="html-2407.16724" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.16724" title="Other formats" id="oth-2407.16724" aria-labelledby="oth-2407.16724">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Structure-aware Domain Knowledge Injection for Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Ze Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+Z">Zhihang Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+R">Rongxin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+F">Fan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yaowu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yue Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jieping Ye</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint. 
Code is available at <a href="https://github.com/alibaba/struxgpt" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This paper introduces a pioneering methodology, termed StructTuning, to efficiently transform foundation Large Language Models (LLMs) into domain specialists. It significantly reduces the training corpus needs to a mere 5% while achieving an impressive 100% of traditional knowledge injection performance. Motivated by structured human education, we propose a novel two-stage strategy for knowledge injection and alignment: Structure-aware Continual Pre-Training (SCPT) and Structure-aware Supervised Fine-Tuning (SSFT). In the SCPT phase, we automatically extract the domain knowledge taxonomy and reorganize the training corpora, enabling LLMs to effectively link textual segments to targeted knowledge points within the taxonomy. In the SSFT phase, we explicitly prompt models to elucidate the underlying knowledge structure in their outputs, leveraging the structured domain insight to address practical problems. Our ultimate method was extensively evaluated across model architectures and scales on LongBench and MMedBench datasets, demonstrating superior performance against other knowledge injection methods. We also explored our method's scalability across different training corpus sizes, laying the foundation to enhance domain-specific LLMs with better data utilization. 
</p> </div> </dd> <dt> <a name='item322'>[322]</a> <a href ="/abs/2408.03618" title="Abstract" id="2408.03618"> arXiv:2408.03618 </a> (replaced) [<a href="/pdf/2408.03618" title="Download PDF" id="pdf-2408.03618" aria-labelledby="pdf-2408.03618">pdf</a>, <a href="/format/2408.03618" title="Other formats" id="oth-2408.03618" aria-labelledby="oth-2408.03618">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Logical Fallacy-Informed Framework for Argument Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mouchel,+L">Luca Mouchel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Paul,+D">Debjit Paul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+S">Shaobo Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=West,+R">Robert West</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bosselut,+A">Antoine Bosselut</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Faltings,+B">Boi Faltings</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Despite the remarkable performance of Large Language Models (LLMs) in natural language processing tasks, they still struggle with generating logically sound arguments, resulting in potential risks such as spreading misinformation. To address this issue, we introduce FIPO, a fallacy-informed framework that leverages preference optimization methods to steer LLMs toward logically sound arguments. FIPO includes a classification loss, to capture the fine-grained information on fallacy types. Our results on argumentation datasets show that our method reduces the fallacy errors by up to 17.5%. 
Furthermore, our human evaluation results indicate that the quality of the generated arguments by our method significantly outperforms the fine-tuned baselines, as well as other preference optimization methods, such as DPO. These findings highlight the importance of ensuring models are aware of logical fallacies for effective argument generation. Our code is available at <a href="http://github.com/lucamouchel/Logical-Fallacies" rel="external noopener nofollow" class="link-external link-http">this http URL</a>. </p> </div> </dd> <dt> <a name='item323'>[323]</a> <a href ="/abs/2408.04237" title="Abstract" id="2408.04237"> arXiv:2408.04237 </a> (replaced) [<a href="/pdf/2408.04237" title="Download PDF" id="pdf-2408.04237" aria-labelledby="pdf-2408.04237">pdf</a>, <a href="https://arxiv.org/html/2408.04237v2" title="View HTML" id="html-2408.04237" aria-labelledby="html-2408.04237" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.04237" title="Other formats" id="oth-2408.04237" aria-labelledby="oth-2408.04237">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Rewrite: Generalized LLM-Generated Text Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Ran Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+W">Wei Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+W">Weiliang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Junfeng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+C">Chengzhi Mao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) present significant risks when used to generate non-factual content and spread disinformation at scale. 
Detecting such LLM-generated content is crucial, yet current detectors often struggle to generalize in open-world contexts. We introduce Learning2Rewrite, a novel framework for detecting AI-generated text with exceptional generalization to unseen domains. Our method leverages the insight that LLMs inherently modify AI-generated content less than human-written text when tasked with rewriting. By training LLMs to minimize alterations on AI-generated inputs, we amplify this disparity, yielding a more distinguishable and generalizable edit distance across diverse text distributions. Extensive experiments on data from 21 independent domains and four major LLMs (GPT-3.5, GPT-4, Gemini, and Llama-3) demonstrate that our detector outperforms state-of-the-art detection methods by up to 23.04% in AUROC for in-distribution tests, 37.26% for out-of-distribution tests, and 48.66% under adversarial attacks. Our unique training objective ensures better generalizability compared to directly training for classification, when leveraging the same amount of parameters. Our findings suggest that reinforcing LLMs' inherent rewriting tendencies offers a robust and scalable solution for detecting AI-generated text. 
</p> </div> </dd> <dt> <a name='item324'>[324]</a> <a href ="/abs/2408.08590" title="Abstract" id="2408.08590"> arXiv:2408.08590 </a> (replaced) [<a href="/pdf/2408.08590" title="Download PDF" id="pdf-2408.08590" aria-labelledby="pdf-2408.08590">pdf</a>, <a href="https://arxiv.org/html/2408.08590v2" title="View HTML" id="html-2408.08590" aria-labelledby="html-2408.08590" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.08590" title="Other formats" id="oth-2408.08590" aria-labelledby="oth-2408.08590">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+G">Geonhee Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Valentino,+M">Marco Valentino</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Freitas,+A">André Freitas</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent studies on logical reasoning in Language Models (LMs) have sparked a debate on whether they can learn systematic reasoning principles during pre-training or merely exploit superficial patterns in the training data. This paper presents a mechanistic interpretation of syllogistic reasoning in LMs to advance the understanding of internal dynamics. Specifically, we present a methodology for circuit discovery aimed at interpreting content-independent reasoning mechanisms. Through two distinct intervention methods, we uncover a sufficient and necessary circuit involving middle-term suppression that elucidates how LMs transfer information to derive valid conclusions from premises. 
Furthermore, we investigate how belief biases manifest in syllogistic reasoning, finding evidence of partial contamination from additional attention heads responsible for encoding commonsense and contextualized knowledge. Finally, we explore the generalization of the discovered mechanisms across various syllogistic schemes, model sizes and architectures, finding that the identified circuit is sufficient and necessary for the schemes on which the models achieve high downstream accuracy (> 60%), and that the activation patterns apply to models of different families. Overall, our findings suggest that LMs indeed learn transferable content-independent reasoning mechanisms, but that, at the same time, such mechanisms do not involve generalizable and abstract logical primitives, being susceptible to contamination by the same world knowledge acquired during pre-training. </p> </div> </dd> <dt> <a name='item325'>[325]</a> <a href ="/abs/2408.11850" title="Abstract" id="2408.11850"> arXiv:2408.11850 </a> (replaced) [<a href="/pdf/2408.11850" title="Download PDF" id="pdf-2408.11850" aria-labelledby="pdf-2408.11850">pdf</a>, <a href="https://arxiv.org/html/2408.11850v3" title="View HTML" id="html-2408.11850" aria-labelledby="html-2408.11850" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.11850" title="Other formats" id="oth-2408.11850" aria-labelledby="oth-2408.11850">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PEARL: Parallel Speculative Decoding with Adaptive Draft Length </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tianyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lv,+Q">Qitan Lv</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kai Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Jianchen Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+W">Winston Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+X">Xiao Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Speculative decoding (SD), where an extra draft model is employed to provide multiple draft tokens first, and then the original target model verifies these tokens in parallel, has shown great power for LLM inference acceleration. However, existing SD methods suffer from the mutual waiting problem, i.e., the target model gets stuck when the draft model is guessing tokens, and vice versa. This problem is directly incurred by the asynchronous execution of the draft model and the target model and is exacerbated due to the fixed draft length in speculative decoding. To address these challenges, we propose a conceptually simple, flexible, and general framework to boost speculative decoding, namely Parallel spEculative decoding with Adaptive dRaft Length (PEARL). Specifically, PEARL proposes pre-verify to verify the first draft token in advance during the drafting phase, and post-verify to generate more draft tokens during the verification phase. PEARL parallels the drafting phase and the verification phase via applying the two strategies, and achieves adaptive draft length for different scenarios, which effectively alleviates the mutual waiting problem. Experiments on various text generation benchmarks demonstrate the effectiveness of our PEARL, leading to a superior speed up performance up to 4.43$\times$ and 1.50$\times$, compared to auto-regressive decoding and vanilla speculative decoding, respectively. 
Our code is available at <a href="https://github.com/smart-lty/ParallelSpeculativeDecoding" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item326'>[326]</a> <a href ="/abs/2408.14853" title="Abstract" id="2408.14853"> arXiv:2408.14853 </a> (replaced) [<a href="/pdf/2408.14853" title="Download PDF" id="pdf-2408.14853" aria-labelledby="pdf-2408.14853">pdf</a>, <a href="https://arxiv.org/html/2408.14853v2" title="View HTML" id="html-2408.14853" aria-labelledby="html-2408.14853" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.14853" title="Other formats" id="oth-2408.14853" aria-labelledby="oth-2408.14853">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Atoxia: Red-teaming Large Language Models with Target Toxic Answers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yuhao Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+P">Pengyu Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+X">Xiang Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to Findings of NAACL-2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR) </div> <p class='mathjax'> Despite the substantial advancements in artificial intelligence, large language models (LLMs) remain challenged by generation safety. With adversarial jailbreaking prompts, one can effortlessly induce LLMs to output harmful content, causing unexpected negative social impacts. 
This vulnerability highlights the necessity for robust LLM red-teaming strategies to identify and mitigate such risks before large-scale application. To detect specific types of risks, we propose a novel red-teaming method that $\textbf{A}$ttacks LLMs with $\textbf{T}$arget $\textbf{Toxi}$c $\textbf{A}$nswers ($\textbf{Atoxia}$). Given a particular harmful answer, Atoxia generates a corresponding user query and a misleading answer opening to examine the internal defects of a given LLM. The proposed attacker is trained within a reinforcement learning scheme with the LLM outputting probability of the target answer as the reward. We verify the effectiveness of our method on various red-teaming benchmarks, such as AdvBench and HH-Harmless. The empirical results demonstrate that Atoxia can successfully detect safety risks in not only open-source models but also state-of-the-art black-box models such as GPT-4o. </p> </div> </dd> <dt> <a name='item327'>[327]</a> <a href ="/abs/2408.15549" title="Abstract" id="2408.15549"> arXiv:2408.15549 </a> (replaced) [<a href="/pdf/2408.15549" title="Download PDF" id="pdf-2408.15549" aria-labelledby="pdf-2408.15549">pdf</a>, <a href="https://arxiv.org/html/2408.15549v2" title="View HTML" id="html-2408.15549" aria-labelledby="html-2408.15549" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.15549" title="Other formats" id="oth-2408.15549" aria-labelledby="oth-2408.15549">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WildFeedback: Aligning LLMs With In-situ User Interactions And Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+T">Taiwei Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhuoer Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Longqi Yang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Ying-Chun Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Zexue He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+M">Mengting Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Pei Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jauhar,+S">Sujay Jauhar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Sihao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+S">Shan Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongfei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+J">Jieyu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+X">Xiaofeng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xia Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Neville,+J">Jennifer Neville</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 24 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As large language models (LLMs) continue to advance, aligning these models with human preferences has emerged as a critical challenge. Traditional alignment methods, relying on human or LLM annotated datasets, are limited by their resource-intensive nature, inherent subjectivity, misalignment with real-world user preferences, and the risk of feedback loops that amplify model biases. To overcome these limitations, we introduce WildFeedback, a novel framework that leverages in-situ user feedback during conversations with LLMs to create preference datasets automatically. 
Given a corpus of multi-turn user-LLM conversation, WildFeedback identifies and classifies user feedback to LLM responses between conversation turns. The user feedback is then used to create examples of preferred and dispreferred responses according to users' preference. Our experiments demonstrate that LLMs fine-tuned on WildFeedback dataset exhibit significantly improved alignment with user preferences, as evidenced by both traditional benchmarks and our proposed checklist-guided evaluation. By incorporating in-situ feedback from actual users, WildFeedback addresses the scalability, subjectivity, and bias challenges that plague existing approaches, marking a significant step toward developing LLMs that are more responsive to the diverse and evolving needs of their users. </p> </div> </dd> <dt> <a name='item328'>[328]</a> <a href ="/abs/2408.16756" title="Abstract" id="2408.16756"> arXiv:2408.16756 </a> (replaced) [<a href="/pdf/2408.16756" title="Download PDF" id="pdf-2408.16756" aria-labelledby="pdf-2408.16756">pdf</a>, <a href="https://arxiv.org/html/2408.16756v3" title="View HTML" id="html-2408.16756" aria-labelledby="html-2408.16756" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.16756" title="Other formats" id="oth-2408.16756" aria-labelledby="oth-2408.16756">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Well Do LLMs Handle Cantonese? 
Benchmarking Cantonese Capabilities of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jiyue Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+P">Pengan Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+Q">Qinghang Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+L">Lingpeng Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chuan Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The rapid evolution of large language models (LLMs) has transformed the competitive landscape in natural language processing (NLP), particularly for English and other data-rich languages. However, underrepresented languages like Cantonese, spoken by over 85 million people, face significant development gaps, which is particularly concerning given the economic significance of the Guangdong-Hong Kong-Macau Greater Bay Area, and in substantial Cantonese-speaking populations in places like Singapore and North America. Despite its wide use, Cantonese has scant representation in NLP research, especially compared to other languages from similarly developed regions. 
To bridge these gaps, we outline current Cantonese NLP methods and introduce new benchmarks designed to evaluate LLM performance in factual generation, mathematical logic, complex reasoning, and general knowledge in Cantonese, which aim to advance open-source Cantonese LLM technology. We also propose future research directions and recommended models to enhance Cantonese LLM development. </p> </div> </dd> <dt> <a name='item329'>[329]</a> <a href ="/abs/2409.00557" title="Abstract" id="2409.00557"> arXiv:2409.00557 </a> (replaced) [<a href="/pdf/2409.00557" title="Download PDF" id="pdf-2409.00557" aria-labelledby="pdf-2409.00557">pdf</a>, <a href="https://arxiv.org/html/2409.00557v3" title="View HTML" id="html-2409.00557" aria-labelledby="html-2409.00557" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.00557" title="Other formats" id="oth-2409.00557" aria-labelledby="oth-2409.00557">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Ask: When LLM Agents Meet Unclear Instruction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+J">Juluan Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ling,+Z">Zixuan Ling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chan,+Y">Yuk-Kit Chan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chaozheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+C">Cheryl Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Youliang Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+J">Jen-tse Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiao,+W">Wenxiang Jiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+M+R">Michael R. 
Lyu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Software Engineering (cs.SE) </div> <p class='mathjax'> Equipped with the capability to call functions, modern large language models (LLMs) can leverage external tools for addressing a range of tasks unattainable through language skills alone. However, the effective execution of these tools relies heavily not just on the advanced capabilities of LLMs but also on precise user instructions, which often cannot be ensured in the real world. To evaluate the performance of LLMs tool-use under imperfect instructions, we meticulously examine the real-world instructions queried from users, analyze the error patterns, and build a challenging tool-use benchmark called Noisy ToolBench (NoisyToolBench). We find that due to the next-token prediction training objective, LLMs tend to arbitrarily generate the missed argument, which may lead to hallucinations and risks. To address this issue, we propose a novel framework, Ask-when-Needed (AwN), which prompts LLMs to ask questions to users whenever they encounter obstacles due to unclear instructions. Moreover, to reduce the manual labor involved in user-LLM interaction and assess LLMs performance in tool utilization from both accuracy and efficiency perspectives, we design an automated evaluation tool named ToolEvaluator. Our experiments demonstrate that the AwN significantly outperforms existing frameworks for tool learning in the NoisyToolBench. We will release all related code and datasets to support future research. 
</p> </div> </dd> <dt> <a name='item330'>[330]</a> <a href ="/abs/2409.03257" title="Abstract" id="2409.03257"> arXiv:2409.03257 </a> (replaced) [<a href="/pdf/2409.03257" title="Download PDF" id="pdf-2409.03257" aria-labelledby="pdf-2409.03257">pdf</a>, <a href="https://arxiv.org/html/2409.03257v2" title="View HTML" id="html-2409.03257" aria-labelledby="html-2409.03257" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03257" title="Other formats" id="oth-2409.03257" aria-labelledby="oth-2409.03257">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding LLM Development Through Longitudinal Study: Insights from the Open Ko-LLM Leaderboard </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+C">Chanjun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyeonwoo Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Industry </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper conducts a longitudinal study over eleven months to address the limitations of prior research on the Open Ko-LLM Leaderboard, which have relied on empirical studies with restricted observation periods of only five months. By extending the analysis duration, we aim to provide a more comprehensive understanding of the progression in developing Korean large language models (LLMs). Our study is guided by three primary research questions: (1) What are the specific challenges in improving LLM performance across diverse tasks on the Open Ko-LLM Leaderboard over time? (2) How does model size impact task performance correlations across various benchmarks? 
(3) How have the patterns in leaderboard rankings shifted over time on the Open Ko-LLM Leaderboard? By analyzing 1,769 models over this period, our research offers a comprehensive examination of the ongoing advancements in LLMs and the evolving nature of evaluation frameworks. </p> </div> </dd> <dt> <a name='item331'>[331]</a> <a href ="/abs/2409.09866" title="Abstract" id="2409.09866"> arXiv:2409.09866 </a> (replaced) [<a href="/pdf/2409.09866" title="Download PDF" id="pdf-2409.09866" aria-labelledby="pdf-2409.09866">pdf</a>, <a href="https://arxiv.org/html/2409.09866v2" title="View HTML" id="html-2409.09866" aria-labelledby="html-2409.09866" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.09866" title="Other formats" id="oth-2409.09866" aria-labelledby="oth-2409.09866">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> S2Cap: A Benchmark and a Baseline for Singing Style Captioning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ok,+H">Hyunjong Ok</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jaeho Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> Singing voices contain much richer information than common voices, such as diverse vocal and acoustic characteristics. However, existing open-source audio-text datasets for singing voices capture only a limited set of attributes and lack acoustic features, leading to limited utility towards downstream tasks, such as style captioning. 
To fill this gap, we formally consider the task of singing style captioning and introduce S2Cap, a singing voice dataset with comprehensive descriptions of diverse vocal, acoustic and demographic attributes. Based on this dataset, we develop a simple yet effective baseline algorithm for the singing style captioning. The algorithm utilizes two novel technical components: CRESCENDO for mitigating misalignment between pretrained unimodal models, and demixing supervision to regularize the model to focus on the singing voice. Despite its simplicity, the proposed method outperforms state-of-the-art baselines. </p> </div> </dd> <dt> <a name='item332'>[332]</a> <a href ="/abs/2409.12929" title="Abstract" id="2409.12929"> arXiv:2409.12929 </a> (replaced) [<a href="/pdf/2409.12929" title="Download PDF" id="pdf-2409.12929" aria-labelledby="pdf-2409.12929">pdf</a>, <a href="https://arxiv.org/html/2409.12929v2" title="View HTML" id="html-2409.12929" aria-labelledby="html-2409.12929" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.12929" title="Other formats" id="oth-2409.12929" aria-labelledby="oth-2409.12929">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LogicPro: Improving Complex Logical Reasoning via Program-Guided Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Jin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yuchen Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Y">Yonggang Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+S">Shuai Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Mengdi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yixin Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+L">Liangcai Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Z">Zhi Tang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this paper, we propose a new data synthesis method called \textbf{LogicPro}, which leverages LeetCode-style algorithm \underline{Pro}blems and their corresponding \underline{Pro}gram solutions to synthesize Complex \underline{Logic}al Reasoning data in text format. First, we synthesize complex reasoning problems through source algorithm problems and test cases. Then, standard answers and intermediate variable outputs are obtained for each problem based on standard python solutions and test cases. Finally, with the guidance of code intermediate variables, we synthesize the text reasoning process for each reasoning problem. Through this method, we can synthesize data that is difficult, scalable, effective, and comes with golden standard answers and high-quality reasoning processes. As a result, with our 540K synthesized dataset constructed solely from 2,360 algorithm problems, our approach achieves significant improvements in multiple models for the datasets \textit{BBH$^{27}$}, \textit{LogicBench}, \textit{DROP}, \textit{AR-LSAT}, and \textit{GSM8K}, etc., outperforming a wide range of existing reasoning datasets. <br>Code and data are publicly available at <a href="https://github.com/jiangjin1999/LogicPro" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item333'>[333]</a> <a href ="/abs/2409.13338" title="Abstract" id="2409.13338"> arXiv:2409.13338 </a> (replaced) [<a href="/pdf/2409.13338" title="Download PDF" id="pdf-2409.13338" aria-labelledby="pdf-2409.13338">pdf</a>, <a href="https://arxiv.org/html/2409.13338v2" title="View HTML" id="html-2409.13338" aria-labelledby="html-2409.13338" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13338" title="Other formats" id="oth-2409.13338" aria-labelledby="oth-2409.13338">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Time Awareness in Large Language Models: Benchmarking Fact Recall Across Time </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Herel,+D">David Herel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bartek,+V">Vojtech Bartek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jirak,+J">Jiri Jirak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mikolov,+T">Tomas Mikolov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Who is the US President? The answer changes depending on when the question is asked. While large language models (LLMs) are evaluated on various reasoning tasks, they often miss a crucial dimension: time. In real-world scenarios, the correctness of answers is frequently tied to temporal context. To address this gap, we present a novel framework and dataset spanning over 8,000 events from 2018 to 2024, annotated with day-level granularity and sourced globally across domains such as politics, science, and business. 
Our TimeShift evaluation method systematically probes LLMs for temporal reasoning, revealing that base models often outperform instruction-tuned and synthetic-trained counterparts on time-sensitive recall. Additionally, we find that even large-scale models exhibit brittleness in handling paraphrased facts, highlighting unresolved challenges in temporal consistency. By identifying these limitations, our work provides a significant step toward advancing time-aware language models capable of adapting to the dynamic nature of real-world knowledge. </p> </div> </dd> <dt> <a name='item334'>[334]</a> <a href ="/abs/2409.13694" title="Abstract" id="2409.13694"> arXiv:2409.13694 </a> (replaced) [<a href="/pdf/2409.13694" title="Download PDF" id="pdf-2409.13694" aria-labelledby="pdf-2409.13694">pdf</a>, <a href="https://arxiv.org/html/2409.13694v3" title="View HTML" id="html-2409.13694" aria-labelledby="html-2409.13694" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.13694" title="Other formats" id="oth-2409.13694" aria-labelledby="oth-2409.13694">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Source Knowledge Pruning for Retrieval-Augmented Generation: A Benchmark and Empirical Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+S">Shuo Yu</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+M">Mingyue Cheng</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jiqian Yang</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Ouyang,+J">Jie Ouyang</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yucong Luo</a> (1), <a href="https://arxiv.org/search/cs?searchtype=author&query=Lei,+C">Chenyi Lei</a> (2), <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Q">Qi Liu</a> (1), <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+E">Enhong Chen</a> (1) ((1) State Key Laboratory of Cognitive Intelligence, University of Science and Technology of China, Hefei, China (2) Kuaishou Technology, Beijing, China)</div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 9 figures; </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR) </div> <p class='mathjax'> Retrieval-augmented generation (RAG) is increasingly recognized as an effective approach to mitigating the hallucination of large language models (LLMs) through the integration of external knowledge. Despite numerous efforts, most studies focus on a single type of external knowledge source. In contrast, most real-world applications involve diverse knowledge from various sources, a scenario that has been relatively underexplored. The main dilemma is the lack of a suitable dataset incorporating multiple knowledge sources and pre-exploration of the associated issues. To address these challenges, we standardize a benchmark dataset that combines structured and unstructured knowledge across diverse and complementary domains. Building upon the dataset, we identify the limitations of existing methods under such conditions. Therefore, we develop PruningRAG, a plug-and-play RAG framework that uses multi-granularity pruning strategies to more effectively incorporate relevant context and mitigate the negative impact of misleading information. Extensive experimental results demonstrate superior performance of PruningRAG and our insightful findings are also reported. Our dataset and code are publicly available\footnote{<a href="https://github.com/USTCAGI/PruningRAG" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item335'>[335]</a> <a href ="/abs/2409.13949" title="Abstract" id="2409.13949"> arXiv:2409.13949 </a> (replaced) [<a href="/pdf/2409.13949" title="Download PDF" id="pdf-2409.13949" aria-labelledby="pdf-2409.13949">pdf</a>, <a href="/format/2409.13949" title="Other formats" id="oth-2409.13949" aria-labelledby="oth-2409.13949">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mufu: Multilingual Fused Learning for Low-Resource Translation with LLM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lim,+Z+W">Zheng Wei Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+N">Nitish Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+H">Honglin Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cohn,+T">Trevor Cohn</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 29 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multilingual large language models (LLMs) are great translators, but this is largely limited to high-resource languages. For many LLMs, translating in and out of low-resource languages remains a challenging task. To maximize data efficiency in this low-resource setting, we introduce Mufu, which includes a selection of automatically generated multilingual candidates and an instruction to correct inaccurate translations in the prompt. Mufu prompts turn a translation task into a postediting one, and seek to harness the LLM's reasoning capability with auxiliary translation candidates, from which the model is required to assess the input quality, align the semantics cross-lingually, copy from relevant inputs and override instances that are incorrect. 
Our experiments on En-XX translations over the Flores-200 dataset show LLMs finetuned against Mufu-style prompts are robust to poor quality auxiliary translation candidates, achieving performance superior to NLLB 1.3B distilled model in 64% of low- and very-low-resource language pairs. We then distill these models to reduce inference cost, while maintaining on average 3.1 chrF improvement over finetune-only baseline in low-resource translations. </p> </div> </dd> <dt> <a name='item336'>[336]</a> <a href ="/abs/2409.14509" title="Abstract" id="2409.14509"> arXiv:2409.14509 </a> (replaced) [<a href="/pdf/2409.14509" title="Download PDF" id="pdf-2409.14509" aria-labelledby="pdf-2409.14509">pdf</a>, <a href="https://arxiv.org/html/2409.14509v4" title="View HTML" id="html-2409.14509" aria-labelledby="html-2409.14509" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.14509" title="Other formats" id="oth-2409.14509" aria-labelledby="oth-2409.14509">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can AI writing be salvaged? 
Mitigating Idiosyncrasies and Improving Human-AI Alignment in the Writing Process through Edits </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chakrabarty,+T">Tuhin Chakrabarty</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laban,+P">Philippe Laban</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chien-Sheng Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NLP+HCI, Behavioral Science </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computers and Society (cs.CY); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> LLM-based applications are helping people write, and LLM-generated text is making its way into social media, journalism, and our classrooms. However, the differences between LLM-generated and human-written text remain unclear. To explore this, we hired professional writers to edit paragraphs in several creative domains. We first found these writers agree on undesirable idiosyncrasies in LLM-generated text, formalizing it into a seven-category taxonomy (e.g. cliches, unnecessary exposition). Second, we curated the LAMP corpus: 1,057 LLM-generated paragraphs edited by professional writers according to our taxonomy. Analysis of LAMP reveals that none of the LLMs used in our study (GPT4o, Claude-3.5-Sonnet, Llama-3.1-70b) outperform each other in terms of writing quality, revealing common limitations across model families. Third, we explored automatic editing methods to improve LLM-generated text. A large-scale preference annotation confirms that although experts largely prefer text edited by other experts, automatic editing methods show promise in improving alignment between LLM-generated and human-written text. 
</p> </div> </dd> <dt> <a name='item337'>[337]</a> <a href ="/abs/2409.18339" title="Abstract" id="2409.18339"> arXiv:2409.18339 </a> (replaced) [<a href="/pdf/2409.18339" title="Download PDF" id="pdf-2409.18339" aria-labelledby="pdf-2409.18339">pdf</a>, <a href="https://arxiv.org/html/2409.18339v2" title="View HTML" id="html-2409.18339" aria-labelledby="html-2409.18339" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.18339" title="Other formats" id="oth-2409.18339" aria-labelledby="oth-2409.18339">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AER-LLM: Ambiguity-aware Emotion Recognition Leveraging Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+X">Xin Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+Y">Yuan Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sethu,+V">Vidhyasaharan Sethu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dang,+T">Ting Dang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent advancements in Large Language Models (LLMs) have demonstrated great success in many Natural Language Processing (NLP) tasks. In addition to their cognitive intelligence, exploring their capabilities in emotional intelligence is also crucial, as it enables more natural and empathetic conversational AI. Recent studies have shown LLMs' capability in recognizing emotions, but they often focus on single emotion labels and overlook the complex and ambiguous nature of human emotions. 
This study is the first to address this gap by exploring the potential of LLMs in recognizing ambiguous emotions, leveraging their strong generalization capabilities and in-context learning. We design zero-shot and few-shot prompting and incorporate past dialogue as context information for ambiguous emotion recognition. Experiments conducted using three datasets indicate significant potential for LLMs in recognizing ambiguous emotions, and highlight the substantial benefits of including context information. Furthermore, our findings indicate that LLMs demonstrate a high degree of effectiveness in recognizing less ambiguous emotions and exhibit potential for identifying more ambiguous emotions, paralleling human perceptual capabilities. </p> </div> </dd> <dt> <a name='item338'>[338]</a> <a href ="/abs/2410.02465" title="Abstract" id="2410.02465"> arXiv:2410.02465 </a> (replaced) [<a href="/pdf/2410.02465" title="Download PDF" id="pdf-2410.02465" aria-labelledby="pdf-2410.02465">pdf</a>, <a href="https://arxiv.org/html/2410.02465v2" title="View HTML" id="html-2410.02465" aria-labelledby="html-2410.02465" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02465" title="Other formats" id="oth-2410.02465" aria-labelledby="oth-2410.02465">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revealing the Inherent Instructability of Pre-Trained Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=An,+S">Seokhyun An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M">Minji Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyounghun Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 31 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial 
Intelligence (cs.AI) </div> <p class='mathjax'> Instruction tuning -- supervised fine-tuning using instruction-response pairs -- is a key step in making pre-trained large language models (LLMs) instructable. Meanwhile, LLMs perform multitask learning during their pre-training, acquiring extensive knowledge and capabilities. We hypothesize that the pre-training stage can enable them to develop the ability to comprehend and address instructions. To verify this, we propose Response Tuning (RT), which removes the instruction and its corresponding mapping to the response from instruction tuning. Instead, it focuses solely on establishing the response distribution. Our experiments demonstrate that RT models, trained only on responses, can effectively respond to a wide range of instructions and exhibit helpfulness approaching that of their instruction-tuned counterparts. In addition, we observe that the models can recognize and reject unsafe queries after learning the refusal conditions from training responses. Furthermore, we demonstrate that these observations also hold in an in-context learning setting. These findings support our hypothesis, highlighting the extensive inherent capabilities of pre-trained LLMs. 
</p> </div> </dd> <dt> <a name='item339'>[339]</a> <a href ="/abs/2410.02743" title="Abstract" id="2410.02743"> arXiv:2410.02743 </a> (replaced) [<a href="/pdf/2410.02743" title="Download PDF" id="pdf-2410.02743" aria-labelledby="pdf-2410.02743">pdf</a>, <a href="https://arxiv.org/html/2410.02743v2" title="View HTML" id="html-2410.02743" aria-labelledby="html-2410.02743" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02743" title="Other formats" id="oth-2410.02743" aria-labelledby="oth-2410.02743">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MA-RLHF: Reinforcement Learning from Human Feedback with Macro Actions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chai,+Y">Yekun Chai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+H">Haoran Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+H">Huang Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuohuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Y">Yu Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Hua Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Reinforcement learning from human feedback (RLHF) has demonstrated effectiveness in aligning large language models (LLMs) with human preferences. However, token-level RLHF suffers from the credit assignment problem over long sequences, where delayed rewards make it challenging for the model to discern which actions contributed to preferred outcomes. 
This hinders learning efficiency and slows convergence. In this paper, we propose MA-RLHF, a simple yet effective RLHF framework that incorporates macro actions -- sequences of tokens or higher-level language constructs -- into the learning process. By operating at a higher level of abstraction, our approach reduces the temporal distance between actions and rewards, facilitating faster and more accurate credit assignment. This results in more stable policy gradient estimates and enhances learning efficiency within each episode, all without increasing computational complexity during training or inference. We validate our approach through extensive experiments across various model sizes and tasks, including text summarization, dialogue generation, question answering, and program synthesis. Our method achieves substantial performance improvements over standard RLHF, with performance gains of up to 30% in text summarization and code generation, 18% in dialogue, and 8% in question answering tasks. Notably, our approach reaches parity with vanilla RLHF 1.7 ~ 2 times faster in terms of training time and continues to outperform it with further training. We make our code and data publicly available at <a href="https://github.com/ernie-research/MA-RLHF" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item340'>[340]</a> <a href ="/abs/2410.04524" title="Abstract" id="2410.04524"> arXiv:2410.04524 </a> (replaced) [<a href="/pdf/2410.04524" title="Download PDF" id="pdf-2410.04524" aria-labelledby="pdf-2410.04524">pdf</a>, <a href="https://arxiv.org/html/2410.04524v2" title="View HTML" id="html-2410.04524" aria-labelledby="html-2410.04524" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.04524" title="Other formats" id="oth-2410.04524" aria-labelledby="oth-2410.04524">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Toward Secure Tuning: Mitigating Security Risks from Instruction Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Y">Yanrui Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Sendong Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jiawei Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+M">Ming Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+D">Danyang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+S">Shuren Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+F">Fenglei Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Ting Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Instruction fine-tuning has emerged as a critical technique for customizing Large Language Models (LLMs) to specific applications. However, recent studies have highlighted significant security vulnerabilities in fine-tuned LLMs. 
Existing defense efforts focus more on pre-training and post-training methods, yet in-training methods remain underexplored. To fill this gap, we introduce a novel secure-tuning strategy called SWAT. By analyzing how module-level parameters (e.g. Q/K/V/O) affect the security feature space drift, we identify a robust subset of modules, termed Mods_Rob. Our SWAT strategy begins by warming up Mods_Rob to capture low-level features with minimal security risks, followed by training all parameters to achieve optimal task performance. Essentially, this strategy shifts the early learning burden more from global parameters to Mods_Rob, reducing update magnitudes of the non-robust subset. Across various datasets, scenarios, and LLMs, our strategy has demonstrated significant success in mitigating security risks while preserving task performance. Importantly, it can be seamlessly integrated with pre-training and post-training methods, leading to greater improvements. </p> </div> </dd> <dt> <a name='item341'>[341]</a> <a href ="/abs/2410.05248" title="Abstract" id="2410.05248"> arXiv:2410.05248 </a> (replaced) [<a href="/pdf/2410.05248" title="Download PDF" id="pdf-2410.05248" aria-labelledby="pdf-2410.05248">pdf</a>, <a href="https://arxiv.org/html/2410.05248v2" title="View HTML" id="html-2410.05248" aria-labelledby="html-2410.05248" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.05248" title="Other formats" id="oth-2410.05248" aria-labelledby="oth-2410.05248">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SFTMix: Elevating Language Model Instruction Tuning with Mixup Recipe </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yuxin Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shujian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Wenxuan Zhou</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ghassemi,+M">Marzyeh Ghassemi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Sanqiang Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> To acquire instruction-following capabilities, large language models (LLMs) undergo instruction tuning, where they are trained on instruction-response pairs using next-token prediction (NTP). Efforts to improve instruction tuning often focus on higher-quality supervised fine-tuning (SFT) datasets, typically requiring data filtering with proprietary LLMs or human annotation. In this paper, we take a different approach by proposing SFTMix, a novel Mixup-based recipe that elevates LLM instruction tuning beyond the conventional NTP paradigm, without relying on well-curated datasets. Observing that LLMs exhibit uneven confidence across the semantic representation space, we argue that examples with different confidence levels should play distinct roles in instruction tuning--confident data is prone to overfitting, while unconfident data is harder to generalize. Based on this insight, SFTMix leverages training dynamics to identify examples with varying confidence levels, interpolates them to bridge the confidence gap, and applies a Mixup-based regularization to support learning on these additional, interpolated examples. By propagating supervision signals across confidence regions and encouraging linear behavior between them, SFTMix mitigates overfitting in confident examples while enhancing generalization in unconfident ones. We demonstrate the effectiveness of SFTMix in both instruction-following and healthcare-specific SFT tasks, with consistent improvements across LLM families and SFT datasets of varying sizes and qualities. 
Extensive analyses across six directions highlight SFTMix's compatibility with data selection, adaptability to compute-constrained scenarios, and scalability to broader applications. </p> </div> </dd> <dt> <a name='item342'>[342]</a> <a href ="/abs/2410.07173" title="Abstract" id="2410.07173"> arXiv:2410.07173 </a> (replaced) [<a href="/pdf/2410.07173" title="Download PDF" id="pdf-2410.07173" aria-labelledby="pdf-2410.07173">pdf</a>, <a href="https://arxiv.org/html/2410.07173v2" title="View HTML" id="html-2410.07173" aria-labelledby="html-2410.07173" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.07173" title="Other formats" id="oth-2410.07173" aria-labelledby="oth-2410.07173">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Better Language Models Exhibit Higher Visual Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ruthardt,+J">Jona Ruthardt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Burghouts,+G+J">Gertjan J. Burghouts</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belongie,+S">Serge Belongie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Asano,+Y+M">Yuki M. Asano</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> How well do text-only Large Language Models (LLMs) naturally align with the visual world? We provide the first direct analysis by utilizing frozen text representations in a discriminative vision-language model framework and measuring zero-shot generalization on unseen classes. We find decoder-based LLMs exhibit high intrinsic visual alignment. In particular, more capable LLMs reliably demonstrate stronger generalization. 
Moreover, utilizing frozen LLMs leads to strong gains in cross-lingual settings, where our approach surpasses CLIP's accuracy of 1.4% with 38.7% for Chinese. Our proposed method improves both robustness and generalization and also significantly reduces the need for paired data and compute, making vision-language models more accessible and adaptable. </p> </div> </dd> <dt> <a name='item343'>[343]</a> <a href ="/abs/2410.09412" title="Abstract" id="2410.09412"> arXiv:2410.09412 </a> (replaced) [<a href="/pdf/2410.09412" title="Download PDF" id="pdf-2410.09412" aria-labelledby="pdf-2410.09412">pdf</a>, <a href="https://arxiv.org/html/2410.09412v2" title="View HTML" id="html-2410.09412" aria-labelledby="html-2410.09412" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.09412" title="Other formats" id="oth-2410.09412" aria-labelledby="oth-2410.09412">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FB-Bench: A Fine-Grained Multi-Task Benchmark for Evaluating LLMs' Responsiveness to Human Feedback </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Youquan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+M">Miao Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+G">Guosheng Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+B">Bin Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Weipeng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Z">Zenan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wentao Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> 
<p class='mathjax'> Human feedback is crucial in the interactions between humans and Large Language Models (LLMs). However, existing research primarily focuses on benchmarking LLMs in single-turn dialogues. Even in benchmarks designed for multi-turn dialogues, the user inputs are often independent, neglecting the nuanced and complex nature of human feedback within real-world usage scenarios. To fill this research gap, we introduce FB-Bench, a fine-grained, multi-task benchmark designed to evaluate LLMs' responsiveness to human feedback under real-world usage scenarios in Chinese. Drawing from the two main interaction scenarios, FB-Bench comprises 591 meticulously curated samples, encompassing eight task types, five deficiency types of response, and nine feedback types. We extensively evaluate a broad array of popular LLMs, revealing significant variations in their performance across different interaction scenarios. Further analysis indicates that task, human feedback, and deficiencies of previous responses can also significantly impact LLMs' responsiveness. Our findings underscore both the strengths and limitations of current models, providing valuable insights and directions for future research. Code and datasets are available at <a href="https://github.com/PKU-Baichuan-MLSystemLab/FB-Bench" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item344'>[344]</a> <a href ="/abs/2410.10724" title="Abstract" id="2410.10724"> arXiv:2410.10724 </a> (replaced) [<a href="/pdf/2410.10724" title="Download PDF" id="pdf-2410.10724" aria-labelledby="pdf-2410.10724">pdf</a>, <a href="https://arxiv.org/html/2410.10724v2" title="View HTML" id="html-2410.10724" aria-labelledby="html-2410.10724" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10724" title="Other formats" id="oth-2410.10724" aria-labelledby="oth-2410.10724">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models Are Active Critics in NLG Evaluation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+S">Shuying Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Junjie Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Ming Jiang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The conventional paradigm of using large language models (LLMs) for natural language generation (NLG) evaluation relies on pre-defined task definitions and evaluation criteria, positioning LLMs as "passive critics" that strictly follow developer-provided guidelines. However, human evaluators often apply implicit criteria, and their expectations in practice can vary widely based on specific end-user needs. Consequently, these rigid evaluation methods struggle to adapt to diverse scenarios without extensive prompt customization. To address this, we introduce Active-Critic, a novel LLM-based evaluator that transforms LLMs into "active critics'' capable of adapting to diverse NLG tasks using limited example data. 
Active-Critic consists of two stages: (1) self-inferring the target NLG task and relevant evaluation criteria, and (2) dynamically optimizing prompts to produce human-aligned scores along with detailed justifications. Our experiments show that Active-Critic can generate nuanced, context-aware evaluation criteria, enabling it to achieve superior alignment with human judgments across multiple tasks. </p> </div> </dd> <dt> <a name='item345'>[345]</a> <a href ="/abs/2410.10863" title="Abstract" id="2410.10863"> arXiv:2410.10863 </a> (replaced) [<a href="/pdf/2410.10863" title="Download PDF" id="pdf-2410.10863" aria-labelledby="pdf-2410.10863">pdf</a>, <a href="https://arxiv.org/html/2410.10863v2" title="View HTML" id="html-2410.10863" aria-labelledby="html-2410.10863" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10863" title="Other formats" id="oth-2410.10863" aria-labelledby="oth-2410.10863">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring the Personality Traits of LLMs through Latent Features Steering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shu Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Shenzhe Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Liang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+L">Lijie Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mengdi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Di Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have 
significantly advanced dialogue systems and role-playing agents through their ability to generate human-like text. While prior studies have shown that LLMs can exhibit distinct and consistent personalities, the mechanisms through which these models encode and express specific personality traits remain poorly understood. To address this, we investigate how various factors, such as cultural norms and environmental stressors, encoded within LLMs, shape their personality traits, guided by the theoretical framework of social determinism. Inspired by related work on LLM interpretability, we propose a training-free approach to modify the model's behavior by extracting and steering latent features corresponding to factors within the model, thereby eliminating the need for retraining. Furthermore, we analyze the implications of these factors for model safety, focusing on their impact through the lens of personality. </p> </div> </dd> <dt> <a name='item346'>[346]</a> <a href ="/abs/2410.12323" title="Abstract" id="2410.12323"> arXiv:2410.12323 </a> (replaced) [<a href="/pdf/2410.12323" title="Download PDF" id="pdf-2410.12323" aria-labelledby="pdf-2410.12323">pdf</a>, <a href="https://arxiv.org/html/2410.12323v2" title="View HTML" id="html-2410.12323" aria-labelledby="html-2410.12323" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12323" title="Other formats" id="oth-2410.12323" aria-labelledby="oth-2410.12323">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reversal of Thought: Enhancing Large Language Models with Preference-Guided Reverse Reasoning Warm-up </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiahao Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+D">Dehui Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hao Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Di,+Z">Zixiang Di</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naseem,+U">Usman Naseem</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable performance in reasoning tasks but face limitations in mathematical and complex logical reasoning. Existing methods to improve LLMs' logical capabilities either involve traceable or verifiable logical sequences that generate more reliable responses by constructing logical structures yet increase computational costs, or introduces rigid logic template rules, reducing flexibility. In this paper, we propose Reversal of Thought (RoT), a plug-and-play and cost-effective reasoning framework designed to enhance the logical reasoning abilities of LLMs during the warm-up phase prior to batch inference. RoT utilizes a Preference-Guided Reverse Reasoning warm-up strategy, which integrates logical symbols for pseudocode planning through meta-cognitive mechanisms and pairwise preference self-evaluation to generate task-specific prompts solely through demonstrations, aligning with LLMs' cognitive preferences shaped by RLHF. Through reverse reasoning, we utilize a Cognitive Preference Manager to assess knowledge boundaries and further expand LLMs' reasoning capabilities by aggregating solution logic for known tasks and stylistic templates for unknown tasks. Experiments across various tasks demonstrate that RoT surpasses existing baselines in both reasoning accuracy and efficiency. 
</p> </div> </dd> <dt> <a name='item347'>[347]</a> <a href ="/abs/2410.12445" title="Abstract" id="2410.12445"> arXiv:2410.12445 </a> (replaced) [<a href="/pdf/2410.12445" title="Download PDF" id="pdf-2410.12445" aria-labelledby="pdf-2410.12445">pdf</a>, <a href="https://arxiv.org/html/2410.12445v2" title="View HTML" id="html-2410.12445" aria-labelledby="html-2410.12445" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12445" title="Other formats" id="oth-2410.12445" aria-labelledby="oth-2410.12445">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Open Ko-LLM Leaderboard2: Bridging Foundational and Practical Evaluation for Korean LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyeonwoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Dahyun Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+J">Jihoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sukyung Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y">Yungi Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+C">Chanjun Park</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Industry </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The Open Ko-LLM Leaderboard has been instrumental in benchmarking Korean Large Language Models (LLMs), yet it has certain limitations. Notably, the disconnect between quantitative improvements on the overly academic leaderboard benchmarks and the qualitative impact of the models should be addressed. 
Furthermore, the benchmark suite is largely composed of translated versions of their English counterparts, which may not fully capture the intricacies of the Korean language. To address these issues, we propose Open Ko-LLM Leaderboard2, an improved version of the earlier Open Ko-LLM Leaderboard. The original benchmarks are entirely replaced with new tasks that are more closely aligned with real-world capabilities. Additionally, four new native Korean benchmarks are introduced to better reflect the distinct characteristics of the Korean language. Through these refinements, Open Ko-LLM Leaderboard2 seeks to provide a more meaningful evaluation for advancing Korean LLMs. </p> </div> </dd> <dt> <a name='item348'>[348]</a> <a href ="/abs/2410.12480" title="Abstract" id="2410.12480"> arXiv:2410.12480 </a> (replaced) [<a href="/pdf/2410.12480" title="Download PDF" id="pdf-2410.12480" aria-labelledby="pdf-2410.12480">pdf</a>, <a href="https://arxiv.org/html/2410.12480v2" title="View HTML" id="html-2410.12480" aria-labelledby="html-2410.12480" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12480" title="Other formats" id="oth-2410.12480" aria-labelledby="oth-2410.12480">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> KcMF: A Knowledge-compliant Framework for Schema and Entity Matching with Fine-tuning-free LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Y">Yongqin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Huan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Ke Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shou,+L">Lidan Shou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under review; new results and analysis added, typos corrected </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Databases (cs.DB); Machine Learning (cs.LG) </div> <p class='mathjax'> Schema matching (SM) and entity matching (EM) tasks are crucial for data integration. While large language models (LLMs) have shown promising results in these tasks, they suffer from hallucinations and confusion about task instructions. This study presents the Knowledge-Compliant Matching Framework (KcMF), an LLM-based approach that addresses these issues without the need for domain-specific fine-tuning. KcMF employs a once-and-for-all pseudo-code-based task decomposition strategy to adopt natural language statements that guide LLM reasoning and reduce confusion across various task types. We also propose two mechanisms, Dataset as Knowledge (DaK) and Example as Knowledge (EaK), to build domain knowledge sets when unstructured domain knowledge is lacking. Moreover, we introduce a result-ensemble strategy to leverage multiple knowledge sources and suppress badly formatted outputs. Extensive evaluations confirm that KcMF clearly enhances five LLM backbones in both SM and EM tasks while outperforming the non-LLM competitors by an average F1-score of 17.93%. 
</p> </div> </dd> <dt> <a name='item349'>[349]</a> <a href ="/abs/2410.12691" title="Abstract" id="2410.12691"> arXiv:2410.12691 </a> (replaced) [<a href="/pdf/2410.12691" title="Download PDF" id="pdf-2410.12691" aria-labelledby="pdf-2410.12691">pdf</a>, <a href="https://arxiv.org/html/2410.12691v5" title="View HTML" id="html-2410.12691" aria-labelledby="html-2410.12691" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12691" title="Other formats" id="oth-2410.12691" aria-labelledby="oth-2410.12691">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Building Better: Avoiding Pitfalls in Developing Language Resources when Data is Scarce </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ousidhoum,+N">Nedjma Ousidhoum</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Beloucif,+M">Meriem Beloucif</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohammad,+S+M">Saif M. Mohammad</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computers and Society (cs.CY) </div> <p class='mathjax'> Language is a symbolic capital that affects people's lives in many ways (Bourdieu, 1977, 1991). It is a powerful tool that accounts for identities, cultures, traditions, and societies in general. Hence, data in a given language should be viewed as more than a collection of tokens. Good data collection and labeling practices are key to building more human-centered and socially aware technologies. While there has been a rising interest in mid- to low-resource languages within the NLP community, work in this space has to overcome unique challenges such as data scarcity and access to suitable annotators. 
In this paper, we collect feedback from those directly involved in and impacted by NLP artefacts for mid- to low-resource languages. We conduct a quantitative and qualitative analysis of the responses and highlight the main issues related to (1) data quality such as linguistic and cultural data suitability; and (2) the ethics of common annotation practices such as the misuse of online community services. Based on these findings, we make several recommendations for the creation of high-quality language artefacts that reflect the cultural milieu of its speakers, while simultaneously respecting the dignity and labor of data workers. </p> </div> </dd> <dt> <a name='item350'>[350]</a> <a href ="/abs/2410.12916" title="Abstract" id="2410.12916"> arXiv:2410.12916 </a> (replaced) [<a href="/pdf/2410.12916" title="Download PDF" id="pdf-2410.12916" aria-labelledby="pdf-2410.12916">pdf</a>, <a href="https://arxiv.org/html/2410.12916v2" title="View HTML" id="html-2410.12916" aria-labelledby="html-2410.12916" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12916" title="Other formats" id="oth-2410.12916" aria-labelledby="oth-2410.12916">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MSc-SQL: Multi-Sample Critiquing Small Language Models For Text-To-SQL Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gorti,+S+K">Satya Krishna Gorti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gofman,+I">Ilan Gofman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhaoyan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiapeng Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vouitsis,+N">Noël Vouitsis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+G">Guangwei Yu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Cresswell,+J+C">Jesse C. Cresswell</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hosseinzadeh,+R">Rasa Hosseinzadeh</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published at NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Text-to-SQL generation enables non-experts to interact with databases via natural language. Recent advances rely on large closed-source models like GPT-4 that present challenges in accessibility, privacy, and latency. To address these issues, we focus on developing small, efficient, and open-source text-to-SQL models. We demonstrate the benefits of sampling multiple candidate SQL generations and propose our method, MSc-SQL, to critique them using associated metadata. Our sample critiquing model evaluates multiple outputs simultaneously, achieving state-of-the-art performance compared to other open-source models while remaining competitive with larger models at a much lower cost. Full code can be found at <a href="https://github.com/layer6ai-labs/msc-sql" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item351'>[351]</a> <a href ="/abs/2410.13184" title="Abstract" id="2410.13184"> arXiv:2410.13184 </a> (replaced) [<a href="/pdf/2410.13184" title="Download PDF" id="pdf-2410.13184" aria-labelledby="pdf-2410.13184">pdf</a>, <a href="https://arxiv.org/html/2410.13184v3" title="View HTML" id="html-2410.13184" aria-labelledby="html-2410.13184" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.13184" title="Other formats" id="oth-2410.13184" aria-labelledby="oth-2410.13184">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Router-Tuning: A Simple and Effective Approach for Enabling Dynamic-Depth in Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Shwai He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+T">Tao Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+G">Guoheng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+B">Bowei Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xiaoyang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+A">Ang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+D">Dong Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Traditional transformer models often allocate a fixed amount of computational resources to every input token, leading to inefficient and unnecessary computation. To address this, the Mixture of Depths (MoD) was introduced to dynamically adjust the computational depth by skipping less important layers. 
Despite its promise, current MoD approaches remain under-explored and face two main challenges: (1) high training costs due to the need to train the entire model along with the routers that determine which layers to skip, and (2) the risk of performance degradation when important layers are bypassed. In response to the first issue, we propose Router-Tuning, a method that fine-tunes only the router on a small dataset, drastically reducing the computational overhead associated with full model training. For the second challenge, we propose MindSkip, which deploys Attention with Dynamic Depths. This method preserves the model's performance while significantly enhancing computational and memory efficiency. Extensive experiments demonstrate that our approach delivers competitive results while dramatically improving the computation efficiency, e.g., 21\% speedup and only a 0.2\% performance drop. The code is released at <a href="https://github.com/CASE-Lab-UMD/Router-Tuning" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item352'>[352]</a> <a href ="/abs/2410.13276" title="Abstract" id="2410.13276"> arXiv:2410.13276 </a> (replaced) [<a href="/pdf/2410.13276" title="Download PDF" id="pdf-2410.13276" aria-labelledby="pdf-2410.13276">pdf</a>, <a href="https://arxiv.org/html/2410.13276v4" title="View HTML" id="html-2410.13276" aria-labelledby="html-2410.13276" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.13276" title="Other formats" id="oth-2410.13276" aria-labelledby="oth-2410.13276">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SeerAttention: Learning Intrinsic Sparse Attention in Your LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Yizhao Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Z">Zhichen Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+D">Dayou Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+S">Shijie Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+P">Peiyuan Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+J">Jiaxing Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lai,+J">Junjie Lai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=So,+H+K">Hayden Kwok-Hay So</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Ting Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Mao Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Attention is the cornerstone of modern Large Language Models (LLMs). 
Yet its quadratic complexity hinders efficiency and scalability, especially for long-context processing. A promising approach is to leverage sparsity in attention. However, existing sparsity-based solutions predominantly rely on predefined patterns or heuristics at the attention head level, struggling to adapt dynamically to different contexts efficiently. <br>We propose SeerAttention, a simple yet effective attention mechanism that directly learns the block-level attention sparsity from the LLM itself. Inspired by the gating mechanism in Mixture of Experts (MoE), SeerAttention augments the conventional attention with a learnable gate that selectively activates important blocks within the attention map. Specifically, the gate first pools the query (Q) and key (K) tensors along the sequence dimension and processes them through learnable linear layers. The resulting matrices are then multiplied together to produce the gating scores, which are used to predict block-level attention sparsity. Combined with our block-sparse FlashAttention kernel, SeerAttention can achieve significant speedup on GPUs. When applied to pre-trained LLMs, SeerAttention only requires training the gate parameters in a lightweight self-distillation manner, allowing rapid convergence. Our evaluation results demonstrate that SeerAttention achieves better model accuracy and lower latency for long-context pre-filling compared to prior methods. 
Code is available at: <a href="https://github.com/microsoft/SeerAttention" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item353'>[353]</a> <a href ="/abs/2410.13305" title="Abstract" id="2410.13305"> arXiv:2410.13305 </a> (replaced) [<a href="/pdf/2410.13305" title="Download PDF" id="pdf-2410.13305" aria-labelledby="pdf-2410.13305">pdf</a>, <a href="https://arxiv.org/html/2410.13305v2" title="View HTML" id="html-2410.13305" aria-labelledby="html-2410.13305" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.13305" title="Other formats" id="oth-2410.13305" aria-labelledby="oth-2410.13305">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reference-Based Post-OCR Processing with LLM for Diacritic Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Do,+T">Thao Do</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tran,+D+P">Dinh Phu Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vo,+A">An Vo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Daeyoung Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted in the AAAI 2025 (39th) AISI track. Dataset and repo are in the paper </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Extracting fine-grained OCR text from aged documents in diacritic languages remains challenging due to unexpected artifacts, time-induced degradation, and lack of datasets. 
While standalone spell correction approaches have been proposed, they show limited performance for historical documents due to numerous possible OCR error combinations and differences between modern and classical corpus distributions. We propose a method utilizing available content-focused ebooks as a reference base to correct imperfect OCR-generated text, supported by large language models. This technique generates high-precision pseudo-page-to-page labels for diacritic languages, where small strokes pose significant challenges in historical conditions. The pipeline eliminates various types of noise from aged documents and addresses issues such as missing characters, words, and disordered sequences. Our post-processing method, which generated a large OCR dataset of classical Vietnamese books, achieved a mean grading score of 8.72 on a 10-point scale. This outperformed the state-of-the-art transformer-based Vietnamese spell correction model, which scored 7.03 when evaluated on a sampled subset of the dataset. We also trained a baseline OCR model to assess and compare it with well-known engines. Experimental results demonstrate the strength of our baseline model compared to widely used open-source solutions. The resulting dataset will be released publicly to support future studies. 
</p> </div> </dd> <dt> <a name='item354'>[354]</a> <a href ="/abs/2410.14157" title="Abstract" id="2410.14157"> arXiv:2410.14157 </a> (replaced) [<a href="/pdf/2410.14157" title="Download PDF" id="pdf-2410.14157" aria-labelledby="pdf-2410.14157">pdf</a>, <a href="https://arxiv.org/html/2410.14157v2" title="View HTML" id="html-2410.14157" aria-labelledby="html-2410.14157" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14157" title="Other formats" id="oth-2410.14157" aria-labelledby="oth-2410.14157">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Autoregression: Discrete Diffusion for Complex Reasoning and Planning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jiacheng Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+J">Jiahui Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+S">Shansan Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+L">Lin Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhenguo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+L">Lingpeng Kong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Autoregressive language models, despite their impressive capabilities, struggle with complex reasoning and long-term planning tasks. We introduce discrete diffusion models as a novel solution to these challenges. 
Through the lens of subgoal imbalance, we demonstrate how diffusion models effectively learn difficult subgoals that elude autoregressive approaches. We propose Multi-granularity Diffusion Modeling (MDM), which prioritizes subgoals based on difficulty during learning. On complex tasks like Countdown, Sudoku, and Boolean Satisfiability Problems, MDM significantly outperforms autoregressive models without using search techniques. For instance, MDM achieves 91.5\% and 100\% accuracy on Countdown and Sudoku, respectively, compared to 45.8\% and 20.7\% for autoregressive models. Our work highlights the potential of diffusion-based approaches in advancing AI capabilities for sophisticated language understanding and problem-solving tasks. </p> </div> </dd> <dt> <a name='item355'>[355]</a> <a href ="/abs/2410.14387" title="Abstract" id="2410.14387"> arXiv:2410.14387 </a> (replaced) [<a href="/pdf/2410.14387" title="Download PDF" id="pdf-2410.14387" aria-labelledby="pdf-2410.14387">pdf</a>, <a href="https://arxiv.org/html/2410.14387v2" title="View HTML" id="html-2410.14387" aria-labelledby="html-2410.14387" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14387" title="Other formats" id="oth-2410.14387" aria-labelledby="oth-2410.14387">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Do Multilingual Language Models Remember Facts? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fierro,+C">Constanza Fierro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Foroutan,+N">Negar Foroutan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Elliott,+D">Desmond Elliott</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=S%C3%B8gaard,+A">Anders Søgaard</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) store and retrieve vast amounts of factual knowledge acquired during pre-training. Prior research has localized and identified mechanisms behind knowledge recall; however, it has only focused on English monolingual models. The question of how these mechanisms generalize to non-English languages and multilingual LLMs remains unexplored. In this paper, we address this gap by conducting a comprehensive analysis of three multilingual LLMs. First, we show that previously identified recall mechanisms in English largely apply to multilingual contexts, with nuances based on language and architecture. Next, through patching intermediate representations, we localize the role of language during recall, finding that subject enrichment is language-independent, while object extraction is language-dependent. Additionally, we discover that the last token representation acts as a Function Vector (FV), encoding both the language of the query and the content to be extracted from the subject. Furthermore, in decoder-only LLMs, FVs compose these two pieces of information in two separate stages. These insights reveal unique mechanisms in multilingual LLMs for recalling information, highlighting the need for new methodologies--such as knowledge evaluation, fact editing, and knowledge acquisition--that are specifically tailored for multilingual LLMs. 
</p> </div> </dd> <dt> <a name='item356'>[356]</a> <a href ="/abs/2410.14735" title="Abstract" id="2410.14735"> arXiv:2410.14735 </a> (replaced) [<a href="/pdf/2410.14735" title="Download PDF" id="pdf-2410.14735" aria-labelledby="pdf-2410.14735">pdf</a>, <a href="https://arxiv.org/html/2410.14735v4" title="View HTML" id="html-2410.14735" aria-labelledby="html-2410.14735" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.14735" title="Other formats" id="oth-2410.14735" aria-labelledby="oth-2410.14735">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Agent Skill Acquisition for Large Language Models via CycleQD </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kuroki,+S">So Kuroki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nakamura,+T">Taishi Nakamura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Akiba,+T">Takuya Akiba</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+Y">Yujin Tang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To appear at the 13th International Conference on Learning Representations (ICLR 2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Neural and Evolutionary Computing (cs.NE) </div> <p class='mathjax'> Training large language models to acquire specific skills remains a challenging endeavor. Conventional training approaches often struggle with data distribution imbalances and inadequacies in objective functions that do not align well with task-specific performance. 
To address these challenges, we introduce CycleQD, a novel approach that leverages the Quality Diversity framework through a cyclic adaptation of the algorithm, along with a model merging based crossover and an SVD-based mutation. In CycleQD, each task's performance metric is alternated as the quality measure while the others serve as the behavioral characteristics. This cyclic focus on individual tasks allows for concentrated effort on one task at a time, eliminating the need for data ratio tuning and simplifying the design of the objective function. Empirical results from AgentBench indicate that applying CycleQD to LLAMA3-8B-INSTRUCT based models not only enables them to surpass traditional fine-tuning methods in coding, operating systems, and database tasks, but also achieves performance on par with GPT-3.5-TURBO, which potentially contains much more parameters, across these domains. Crucially, this enhanced performance is achieved while retaining robust language capabilities, as evidenced by its performance on widely adopted language benchmark tasks. We highlight the key design choices in CycleQD, detailing how these contribute to its effectiveness. Furthermore, our method is general and can be applied to image segmentation models, highlighting its applicability across different domains. 
</p> </div> </dd> <dt> <a name='item357'>[357]</a> <a href ="/abs/2410.15277" title="Abstract" id="2410.15277"> arXiv:2410.15277 </a> (replaced) [<a href="/pdf/2410.15277" title="Download PDF" id="pdf-2410.15277" aria-labelledby="pdf-2410.15277">pdf</a>, <a href="/format/2410.15277" title="Other formats" id="oth-2410.15277" aria-labelledby="oth-2410.15277">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BRIEF: Bridging Retrieval and Inference for Multi-hop Reasoning via Compression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuankai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+J">Jia-Chen Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Di Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+K">Kai-Wei Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+N">Nanyun Peng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 Findings. Project page: <a href="https://jasonforjoy.github.io/BRIEF/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-augmented generation (RAG) can supplement large language models (LLMs) by integrating external knowledge. However, as the number of retrieved documents increases, the input length to LLMs grows linearly, causing a dramatic increase in latency and a degradation in long-context understanding. This is particularly serious for multi-hop questions that require a chain of reasoning across documents. 
To accelerate inference, reduce costs, and minimize distractions, this paper presents BRIEF (Bridging Retrieval and Inference through Evidence Fusion), a lightweight approach that performs query-aware multi-hop reasoning by compressing retrieved documents into highly dense textual summaries to integrate into in-context RAG. To enable learning compression for multi-hop reasoning, we curate synthetic data by extracting atomic propositions that encapsulate distinct factoids from the source documents to compose synthetic summaries. Based on our synthetic data built entirely by open-source models, BRIEF generates more concise summaries and enables a range of LLMs to achieve exceptional open-domain question answering (QA) performance. For example, on HotpotQA, BRIEF improves the compression rate by 2 times compared to the state-of-the-art baseline, while outperforming it by 3.00% EM and 4.16% F1 with Flan-UL2 as the reader model. It also generates more concise summaries than proprietary GPT-3.5, while demonstrating nearly identical QA performance. </p> </div> </dd> <dt> <a name='item358'>[358]</a> <a href ="/abs/2410.15539" title="Abstract" id="2410.15539"> arXiv:2410.15539 </a> (replaced) [<a href="/pdf/2410.15539" title="Download PDF" id="pdf-2410.15539" aria-labelledby="pdf-2410.15539">pdf</a>, <a href="https://arxiv.org/html/2410.15539v2" title="View HTML" id="html-2410.15539" aria-labelledby="html-2410.15539" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.15539" title="Other formats" id="oth-2410.15539" aria-labelledby="oth-2410.15539">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Grammatical Error Correction for Low-Resource Languages: The Case of Zarma </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Keita,+M+K">Mamadou K. 
Keita</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Homan,+C">Christopher Homan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zampieri,+M">Marcos Zampieri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bremang,+A">Adwoa Bremang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alfari,+H+A">Habibatou Abdoulaye Alfari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ibrahim,+E+A">Elysabhete Amadou Ibrahim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Owusu,+D">Dennis Owusu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Grammatical error correction (GEC) aims to improve quality and readability of texts through accurate correction of linguistic mistakes. Previous work has focused on high-resource languages, while low-resource languages lack robust tools. However, low-resource languages often face problems such as: non-standard orthography, limited annotated corpora, and diverse dialects, which slows down the development of GEC tools. We present a study on GEC for Zarma, spoken by over five million in West Africa. We compare three approaches: rule-based methods, machine translation (MT) models, and large language models (LLMs). We evaluated them using a dataset of more than 250,000 examples, including synthetic and human-annotated data. Our results showed that the MT-based approach using M2M100 outperforms others, with a detection rate of 95.82% and a suggestion accuracy of 78.90% in automatic evaluations (AE) and an average score of 3.0 out of 5.0 in manual evaluation (ME) from native speakers for grammar and logical corrections. The rule-based method was effective for spelling errors but failed on complex context-level errors. LLMs -- MT5-small -- showed moderate performance. 
Our work supports use of MT models to enhance GEC in low-resource settings, and we validated these results with Bambara, another West African language. </p> </div> </dd> <dt> <a name='item359'>[359]</a> <a href ="/abs/2410.16491" title="Abstract" id="2410.16491"> arXiv:2410.16491 </a> (replaced) [<a href="/pdf/2410.16491" title="Download PDF" id="pdf-2410.16491" aria-labelledby="pdf-2410.16491">pdf</a>, <a href="https://arxiv.org/html/2410.16491v2" title="View HTML" id="html-2410.16491" aria-labelledby="html-2410.16491" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.16491" title="Other formats" id="oth-2410.16491" aria-labelledby="oth-2410.16491">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BIG5-CHAT: Shaping LLM Personalities Through Training on Human-Grounded Data </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenkai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiarui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+A">Andy Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xuhui Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Diab,+M">Mona Diab</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sap,+M">Maarten Sap</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> In this work, we tackle the challenge of embedding realistic human personality traits into LLMs. Previous approaches have primarily focused on prompt-based methods that describe the behavior associated with the desired personality traits, suffering from realism and validity issues. 
To address these limitations, we introduce BIG5-CHAT, a large-scale dataset containing 100,000 dialogues designed to ground models in how humans express their personality in language. Leveraging this dataset, we explore Supervised Fine-Tuning and Direct Preference Optimization as training-based methods to align LLMs more naturally with human personality patterns. Our methods outperform prompting on personality assessments such as BFI and IPIP-NEO, with trait correlations more closely matching human data. Furthermore, our experiments reveal that models trained to exhibit higher conscientiousness, higher agreeableness, lower extraversion, and lower neuroticism display better performance on reasoning tasks, aligning with psychological findings on how these traits impact human cognitive performance. To our knowledge, this work is the first comprehensive study to demonstrate how training-based methods can shape LLM personalities through learning from real human behaviors. </p> </div> </dd> <dt> <a name='item360'>[360]</a> <a href ="/abs/2410.20779" title="Abstract" id="2410.20779"> arXiv:2410.20779 </a> (replaced) [<a href="/pdf/2410.20779" title="Download PDF" id="pdf-2410.20779" aria-labelledby="pdf-2410.20779">pdf</a>, <a href="https://arxiv.org/html/2410.20779v2" title="View HTML" id="html-2410.20779" aria-labelledby="html-2410.20779" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.20779" title="Other formats" id="oth-2410.20779" aria-labelledby="oth-2410.20779">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decoding Reading Goals from Eye Movements </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shubi,+O">Omer Shubi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hadar,+C+A">Cfir Avraham Hadar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Berzak,+Y">Yevgeni Berzak</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Readers can have different goals with respect to the text that they are reading. Can these goals be decoded from their eye movements over the text? In this work, we examine for the first time whether it is possible to distinguish between two types of common reading goals: information seeking and ordinary reading for comprehension. Using large-scale eye tracking data, we address this task with a wide range of models that cover different architectural and data representation strategies, and further introduce a new model ensemble. We find that transformer-based models with scanpath representations coupled with language modeling solve it most successfully, and that accurate predictions can be made in real time, long before the participant finished reading the text. We further introduce a new method for model performance analysis based on mixed effect modeling. Combining this method with rich textual annotations reveals key properties of textual items and participants that contribute to the difficulty of the task, and improves our understanding of the variability in eye movement patterns across the two reading regimes. 
</p> </div> </dd> <dt> <a name='item361'>[361]</a> <a href ="/abs/2410.21013" title="Abstract" id="2410.21013"> arXiv:2410.21013 </a> (replaced) [<a href="/pdf/2410.21013" title="Download PDF" id="pdf-2410.21013" aria-labelledby="pdf-2410.21013">pdf</a>, <a href="https://arxiv.org/html/2410.21013v3" title="View HTML" id="html-2410.21013" aria-labelledby="html-2410.21013" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.21013" title="Other formats" id="oth-2410.21013" aria-labelledby="oth-2410.21013">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Frequency matters: Modeling irregular morphological patterns in Spanish with Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ramarao,+A+K">Akhilesh Kakolu Ramarao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+K">Kevin Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baer-Henney,+D">Dinah Baer-Henney</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Typos and grammatical corrections </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Over the past decade, various studies have addressed how speakers solve the so-called `The Paradigm Cell Filling Problem' (PCFP) \citep{ackerman2009parts} across different languages. The PCFP addresses a fundamental question in morphological processing: how do speakers accurately generate inflected forms of words when presented with incomplete paradigms? This problem is particularly salient when modeling complex inflectional systems. We focus on Spanish verbal paradigms, where certain verbs follow an irregular L-shaped pattern, where the first-person singular present indicative stem matches the stem used throughout the present subjunctive mood. 
We formulate the problem as a morphological reinflection task. Specifically, we investigate the role of input frequency in the acquisition of regular versus irregular L-shaped patterns in transformer models. By systematically manipulating the input distributions and analyzing model behavior, we reveal four key findings: 1) Models perform better on L-shaped verbs compared to regular verbs, especially in uneven frequency conditions; 2) Robust primacy effects are observed, but no consistent recency effects; 3) Memorization becomes more prominent as the proportion of L-shaped verbs increases; 4) There is a tendency to regularize L-shaped verbs when their consonant alternation pairs are rare or absent in the training data. </p> </div> </dd> <dt> <a name='item362'>[362]</a> <a href ="/abs/2410.21662" title="Abstract" id="2410.21662"> arXiv:2410.21662 </a> (replaced) [<a href="/pdf/2410.21662" title="Download PDF" id="pdf-2410.21662" aria-labelledby="pdf-2410.21662">pdf</a>, <a href="https://arxiv.org/html/2410.21662v2" title="View HTML" id="html-2410.21662" aria-labelledby="html-2410.21662" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.21662" title="Other formats" id="oth-2410.21662" aria-labelledby="oth-2410.21662">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> $f$-PO: Generalizing Preference Optimization with $f$-divergence Minimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jiaqi Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Mingjian Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Y">Yuxuan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ermon,+S">Stefano Ermon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+M">Minkai Xu</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> 
AISTATS 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Preference optimization has made significant progress recently, with numerous methods developed to align language models with human preferences. This paper introduces $f$-divergence Preference Optimization ($f$-PO), a novel framework that generalizes and extends existing approaches. $f$-PO minimizes $f$-divergences between the optimized policy and the optimal policy, encompassing a broad family of alignment methods using various divergences. Our approach unifies previous algorithms like DPO and EXO, while offering new variants through different choices of $f$-divergences. We provide theoretical analysis of $f$-PO's properties and conduct extensive experiments on state-of-the-art language models using benchmark datasets. Results demonstrate $f$-PO's effectiveness across various tasks, achieving superior performance compared to existing methods on popular benchmarks such as AlpacaEval 2, Arena-Hard, MT-Bench, and Open LLM Leaderboard v2. Additionally, we present ablation studies exploring the impact of different $f$-divergences, offering insights into the trade-offs between regularization and performance in offline preference optimization. Our work contributes both practical algorithms and theoretical understanding to the field of language model alignment. Code is available at <a href="https://github.com/MinkaiXu/fPO" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item363'>[363]</a> <a href ="/abs/2410.21728" title="Abstract" id="2410.21728"> arXiv:2410.21728 </a> (replaced) [<a href="/pdf/2410.21728" title="Download PDF" id="pdf-2410.21728" aria-labelledby="pdf-2410.21728">pdf</a>, <a href="https://arxiv.org/html/2410.21728v2" title="View HTML" id="html-2410.21728" aria-labelledby="html-2410.21728" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.21728" title="Other formats" id="oth-2410.21728" aria-labelledby="oth-2410.21728">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Let's Be Self-generated via Step by Step: A Curriculum Learning Approach to Automated Reasoning with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+Z">Zichen Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+Z">Zhenmin Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+L">Lingfeng Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+M">Meng Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+D">Di Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shu,+J">Jinlong Shu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> While Chain of Thought (CoT) prompting approaches have significantly consolidated the reasoning capabilities of large language models (LLMs), they still face limitations that require extensive human effort or have performance needs to be improved. 
Existing endeavors have focused on bridging these gaps; however, these approaches either hinge on external data and cannot completely eliminate manual effort, or they fall short in effectively directing LLMs to generate high-quality exemplary prompts. To address the said pitfalls, we propose a novel prompt approach for automatic reasoning named \textbf{LBS3}, inspired by curriculum learning which better reflects human learning habits. Specifically, LBS3 initially steers LLMs to recall easy-to-hard proxy queries that are pertinent to the target query. Following this, it invokes a progressive strategy that utilizes exemplary prompts stemming from easy-proxy queries to direct LLMs in solving hard-proxy queries, ensuring the high quality of the proxy solutions. Finally, our extensive experiments in various reasoning-intensive tasks with varying open- and closed-source LLMs show that LBS3 achieves strongly competitive performance compared to the SOTA baselines. </p> </div> </dd> <dt> <a name='item364'>[364]</a> <a href ="/abs/2410.22108" title="Abstract" id="2410.22108"> arXiv:2410.22108 </a> (replaced) [<a href="/pdf/2410.22108" title="Download PDF" id="pdf-2410.22108" aria-labelledby="pdf-2410.22108">pdf</a>, <a href="/format/2410.22108" title="Other formats" id="oth-2410.22108" aria-labelledby="oth-2410.22108">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Protecting Privacy in Multimodal Large Language Models with MLLMU-Bench </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zheyuan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dou,+G">Guangyao Dou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+M">Mengzhao Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+Z">Zhaoxuan Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Q">Qingkai Zeng</a>, <a
href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Y">Yongle Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+M">Meng Jiang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL Main 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Generative models such as Large Language Models (LLM) and Multimodal Large Language models (MLLMs) trained on massive web corpora can memorize and disclose individuals' confidential and private data, raising legal and ethical concerns. While many previous works have addressed this issue in LLM via machine unlearning, it remains largely unexplored for MLLMs. To tackle this challenge, we introduce Multimodal Large Language Model Unlearning Benchmark (MLLMU-Bench), a novel benchmark aimed at advancing the understanding of multimodal machine unlearning. MLLMU-Bench consists of 500 fictitious profiles and 153 profiles for public celebrities, each profile featuring over 14 customized question-answer pairs, evaluated from both multimodal (image+text) and unimodal (text) perspectives. The benchmark is divided into four sets to assess unlearning algorithms in terms of efficacy, generalizability, and model utility. Finally, we provide baseline results using existing generative model unlearning algorithms. Surprisingly, our experiments show that unimodal unlearning algorithms excel in generation and cloze tasks, while multimodal unlearning approaches perform better in classification tasks with multimodal inputs. 
</p> </div> </dd> <dt> <a name='item365'>[365]</a> <a href ="/abs/2410.23166" title="Abstract" id="2410.23166"> arXiv:2410.23166 </a> (replaced) [<a href="/pdf/2410.23166" title="Download PDF" id="pdf-2410.23166" aria-labelledby="pdf-2410.23166">pdf</a>, <a href="https://arxiv.org/html/2410.23166v2" title="View HTML" id="html-2410.23166" aria-labelledby="html-2410.23166" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.23166" title="Other formats" id="oth-2410.23166" aria-labelledby="oth-2410.23166">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SciPIP: An LLM-based Scientific Paper Idea Proposer </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxiao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+L">Lihui Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Liye Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yunxiang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+Y">Yi Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+C">Chen Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+L">Liang Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+B">Binbin Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+X">Xiaofei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jieping Ye</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 5 figures, 12 tables. 
The code has been available: <a href="https://github.com/cheerss/SciPIP" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Information Retrieval (cs.IR); Machine Learning (cs.LG) </div> <p class='mathjax'> The rapid advancement of large language models (LLMs) has opened new possibilities for automating the proposal of innovative scientific ideas. This process involves two key phases: literature retrieval and idea generation. However, existing approaches often fall short due to their reliance on keyword-based search tools during the retrieval phase, which neglects crucial semantic information and frequently results in incomplete retrieval outcomes. Similarly, in the idea generation phase, current methodologies tend to depend solely on the internal knowledge of LLMs or metadata from retrieved papers, thereby overlooking significant valuable insights contained within the full texts. To address these limitations, we introduce SciPIP, an innovative framework designed to enhance the LLM-based proposal of scientific ideas through improvements in both literature retrieval and idea generation. Our approach begins with the construction of a comprehensive literature database that supports advanced retrieval based not only on keywords but also on semantics and citation relationships. This is complemented by the introduction of a multi-granularity retrieval algorithm aimed at ensuring more thorough and exhaustive retrieval results. For the idea generation phase, we propose a dual-path framework that effectively integrates both the content of retrieved papers and the extensive internal knowledge of LLMs. This integration significantly boosts the novelty, feasibility, and practical value of proposed ideas. 
Our experiments, conducted across various domains such as natural language processing and computer vision, demonstrate SciPIP's capability to generate a multitude of innovative and useful ideas. These findings underscore SciPIP's potential as a valuable tool for researchers seeking to advance their fields with groundbreaking concepts. </p> </div> </dd> <dt> <a name='item366'>[366]</a> <a href ="/abs/2410.23918" title="Abstract" id="2410.23918"> arXiv:2410.23918 </a> (replaced) [<a href="/pdf/2410.23918" title="Download PDF" id="pdf-2410.23918" aria-labelledby="pdf-2410.23918">pdf</a>, <a href="https://arxiv.org/html/2410.23918v3" title="View HTML" id="html-2410.23918" aria-labelledby="html-2410.23918" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.23918" title="Other formats" id="oth-2410.23918" aria-labelledby="oth-2410.23918">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BitStack: Any-Size Compression of Large Language Models in Variable Memory Environments </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xinghao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Pengyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+D">Dong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yunhua Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) </div> <p class='mathjax'> Large 
language models (LLMs) have revolutionized numerous applications, yet their deployment remains challenged by memory constraints on local devices. While scaling laws have enhanced LLM capabilities, the primary bottleneck has shifted from \textit{capability} to \textit{availability}, emphasizing the need for efficient memory management. Traditional compression methods, such as quantization, often require predefined compression ratios and separate compression processes for each setting, complicating deployment in variable memory environments. In this paper, we introduce \textbf{BitStack}, a novel, training-free weight compression approach that enables megabyte-level trade-offs between memory usage and model performance. By leveraging weight decomposition, BitStack can dynamically adjust the model size with minimal transmission between running memory and storage devices. Our approach iteratively decomposes weight matrices while considering the significance of each parameter, resulting in an approximately 1-bit per parameter residual block in each decomposition iteration. These blocks are sorted and stacked in storage as basic transmission units, with different quantities loaded based on current memory availability. Extensive experiments across a wide range of tasks demonstrate that, despite offering fine-grained size control, BitStack consistently matches or surpasses strong quantization baselines, particularly at extreme compression ratios. To the best of our knowledge, this is the first decomposition-based method that effectively bridges the gap to practical compression techniques like quantization. Code is available at <a href="https://github.com/xinghaow99/BitStack" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item367'>[367]</a> <a href ="/abs/2411.02305" title="Abstract" id="2411.02305"> arXiv:2411.02305 </a> (replaced) [<a href="/pdf/2411.02305" title="Download PDF" id="pdf-2411.02305" aria-labelledby="pdf-2411.02305">pdf</a>, <a href="https://arxiv.org/html/2411.02305v2" title="View HTML" id="html-2411.02305" aria-labelledby="html-2411.02305" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.02305" title="Other formats" id="oth-2411.02305" aria-labelledby="oth-2411.02305">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CRMArena: Understanding the Capacity of LLM Agents to Perform Professional CRM Tasks in Realistic Environments </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+K">Kung-Hsiang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Prabhakar,+A">Akshara Prabhakar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dhawan,+S">Sidharth Dhawan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+Y">Yixin Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Huan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Savarese,+S">Silvio Savarese</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+C">Caiming Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Laban,+P">Philippe Laban</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chien-Sheng Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Customer Relationship Management (CRM) systems are vital for modern enterprises, providing a 
foundation for managing customer interactions and data. Integrating AI agents into CRM systems can automate routine processes and enhance personalized service. However, deploying and evaluating these agents is challenging due to the lack of realistic benchmarks that reflect the complexity of real-world CRM tasks. To address this issue, we introduce CRMArena, a novel benchmark designed to evaluate AI agents on realistic tasks grounded in professional work environments. Following guidance from CRM experts and industry best practices, we designed CRMArena with nine customer service tasks distributed across three personas: service agent, analyst, and manager. The benchmark includes 16 commonly used industrial objects (e.g., account, order, knowledge article, case) with high interconnectivity, along with latent variables (e.g., complaint habits, policy violations) to simulate realistic data distributions. Experimental results reveal that state-of-the-art LLM agents succeed in less than 40% of the tasks with ReAct prompting, and less than 55% even with function-calling abilities. Our findings highlight the need for enhanced agent capabilities in function-calling and rule-following to be deployed in real-world work environments. CRMArena is an open challenge to the community: systems that can reliably complete tasks showcase direct business value in a popular work environment. 
</p> </div> </dd> <dt> <a name='item368'>[368]</a> <a href ="/abs/2411.03888" title="Abstract" id="2411.03888"> arXiv:2411.03888 </a> (replaced) [<a href="/pdf/2411.03888" title="Download PDF" id="pdf-2411.03888" aria-labelledby="pdf-2411.03888">pdf</a>, <a href="https://arxiv.org/html/2411.03888v2" title="View HTML" id="html-2411.03888" aria-labelledby="html-2411.03888" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.03888" title="Other formats" id="oth-2411.03888" aria-labelledby="oth-2411.03888">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi3Hate: Multimodal, Multilingual, and Multicultural Hate Speech Detection with Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bui,+M+D">Minh Duc Bui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=von+der+Wense,+K">Katharina von der Wense</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lauscher,+A">Anne Lauscher</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to NAACL 2025 Main (Camera-Ready Version) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Warning: this paper contains content that may be offensive or upsetting <br>Hate speech moderation on global platforms poses unique challenges due to the multimodal and multilingual nature of content, along with the varying cultural perceptions. How well do current vision-language models (VLMs) navigate these nuances? To investigate this, we create the first multimodal and multilingual parallel hate speech dataset, annotated by a multicultural set of annotators, called Multi3Hate. It contains 300 parallel meme samples across 5 languages: English, German, Spanish, Hindi, and Mandarin. 
We demonstrate that cultural background significantly affects multimodal hate speech annotation in our dataset. The average pairwise agreement among countries is just 74%, significantly lower than that of randomly selected annotator groups. Our qualitative analysis indicates that the lowest pairwise label agreement-only 67% between the USA and India-can be attributed to cultural factors. We then conduct experiments with 5 large VLMs in a zero-shot setting, finding that these models align more closely with annotations from the US than with those from other cultures, even when the memes and prompts are presented in the dominant language of the other culture. Code and dataset are available at <a href="https://github.com/MinhDucBui/Multi3Hate" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item369'>[369]</a> <a href ="/abs/2411.06438" title="Abstract" id="2411.06438"> arXiv:2411.06438 </a> (replaced) [<a href="/pdf/2411.06438" title="Download PDF" id="pdf-2411.06438" aria-labelledby="pdf-2411.06438">pdf</a>, <a href="https://arxiv.org/html/2411.06438v3" title="View HTML" id="html-2411.06438" aria-labelledby="html-2411.06438" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.06438" title="Other formats" id="oth-2411.06438" aria-labelledby="oth-2411.06438">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Conditional [MASK] Discrete Diffusion Language Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koh,+H">Hyukhun Koh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jhang,+M">Minha Jhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+D">Dohyung Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sangmook Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jung,+K">Kyomin 
Jung</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Although auto-regressive models excel in natural language processing, they often struggle to generate diverse text and provide limited controllability. Non-auto-regressive methods could be an alternative but often produce degenerate outputs and exhibit shortcomings in conditional generation. To address these challenges, we propose Diffusion-EAGS, a novel framework that integrates conditional masked language models into diffusion language models through the theoretical lens of a conditional Markov Random Field. In doing so, we propose entropy-adaptive Gibbs sampling and entropy-based noise scheduling to counterbalance each model's shortcomings. Experimental results show that Diffusion-EAGS outperforms baselines and achieves the best quality-diversity tradeoff, demonstrating its effectiveness in non-autoregressive text generation. 
</p> </div> </dd> <dt> <a name='item370'>[370]</a> <a href ="/abs/2411.06729" title="Abstract" id="2411.06729"> arXiv:2411.06729 </a> (replaced) [<a href="/pdf/2411.06729" title="Download PDF" id="pdf-2411.06729" aria-labelledby="pdf-2411.06729">pdf</a>, <a href="https://arxiv.org/html/2411.06729v3" title="View HTML" id="html-2411.06729" aria-labelledby="html-2411.06729" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.06729" title="Other formats" id="oth-2411.06729" aria-labelledby="oth-2411.06729">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reverse Prompt Engineering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Hanqing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klabjan,+D">Diego Klabjan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> We explore a new language model inversion problem under strict black-box, zero-shot, and limited data conditions. We propose a novel training-free framework that reconstructs prompts using only a limited number of text outputs from a language model. Existing methods rely on the availability of a large number of outputs for both training and inference, an assumption that is unrealistic in the real world, and they can sometimes produce garbled text. In contrast, our approach, which relies on limited resources, consistently yields coherent and semantically meaningful prompts. Our framework leverages a large language model together with an optimization process inspired by the genetic algorithm to effectively recover prompts. 
Experimental results on several datasets derived from public sources indicate that our approach achieves high-quality prompt recovery and generates prompts more semantically and functionally aligned with the originals than current state-of-the-art methods. Additionally, use-case studies introduced demonstrate the method's strong potential for generating high-quality text data on perturbed prompts. </p> </div> </dd> <dt> <a name='item371'>[371]</a> <a href ="/abs/2411.07381" title="Abstract" id="2411.07381"> arXiv:2411.07381 </a> (replaced) [<a href="/pdf/2411.07381" title="Download PDF" id="pdf-2411.07381" aria-labelledby="pdf-2411.07381">pdf</a>, <a href="https://arxiv.org/html/2411.07381v4" title="View HTML" id="html-2411.07381" aria-labelledby="html-2411.07381" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.07381" title="Other formats" id="oth-2411.07381" aria-labelledby="oth-2411.07381">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MaLei at the PLABA Track of TREC 2024: RoBERTa for Term Replacement -- LLaMA3.1 and GPT-4o for Complete Abstract Adaptation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ling,+Z">Zhidong Ling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zihao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Romero,+P">Pablo Romero</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+L">Lifeng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nenadic,+G">Goran Nenadic</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ongoing work - system report for PLABA2024 with TREC-2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This report is the system description of the 
MaLei team (Manchester and Leiden) for the shared task Plain Language Adaptation of Biomedical Abstracts (PLABA) 2024 (we had an earlier name BeeManc following last year), affiliated with TREC2024 (33rd Text REtrieval Conference <a href="https://ir.nist.gov/evalbase/conf/trec-2024" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). This report contains two sections corresponding to the two sub-tasks in PLABA-2024. In task one (term replacement), we applied fine-tuned RoBERTa-Base models to identify and classify the difficult terms, jargon, and acronyms in the biomedical abstracts and reported the F1 score (Task 1A and 1B). In task two (complete abstract adaptation), we leveraged LLaMA3.1-70B-Instruct and GPT-4o with the one-shot prompts to complete the abstract adaptation and reported the scores in BLEU, SARI, BERTScore, LENS, and SALSA. From the official Evaluation from PLABA-2024 on Task 1A and 1B, our much smaller fine-tuned RoBERTa-Base model ranked 3rd and 2nd respectively on the two sub-tasks, and the 1st on averaged F1 scores across the two tasks from 9 evaluated systems. Our LLaMA-3.1-70B-instructed model achieved the highest Completeness score for Task 2. 
We share our source codes, fine-tuned models, and related resources at <a href="https://github.com/HECTA-UoM/PLABA2024" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item372'>[372]</a> <a href ="/abs/2411.12405" title="Abstract" id="2411.12405"> arXiv:2411.12405 </a> (replaced) [<a href="/pdf/2411.12405" title="Download PDF" id="pdf-2411.12405" aria-labelledby="pdf-2411.12405">pdf</a>, <a href="https://arxiv.org/html/2411.12405v2" title="View HTML" id="html-2411.12405" aria-labelledby="html-2411.12405" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.12405" title="Other formats" id="oth-2411.12405" aria-labelledby="oth-2411.12405">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluating the Prompt Steerability of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Miehling,+E">Erik Miehling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Desmond,+M">Michael Desmond</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ramamurthy,+K+N">Karthikeyan Natesan Ramamurthy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Daly,+E+M">Elizabeth M. 
Daly</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dognin,+P">Pierre Dognin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rios,+J">Jesus Rios</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bouneffouf,+D">Djallel Bouneffouf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Miao Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Short version appeared at the Pluralistic Alignment workshop at NeurIPS 2024; extended version appeared at NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Building pluralistic AI requires designing models that are able to be shaped to represent a wide range of value systems and cultures. Achieving this requires first being able to evaluate the degree to which a given model is capable of reflecting various personas. To this end, we propose a benchmark for evaluating the steerability of model personas as a function of prompting. Our design is based on a formal definition of prompt steerability, which analyzes the degree to which a model's joint behavioral distribution can be shifted from its baseline. By defining steerability indices and inspecting how these indices change as a function of steering effort, we can estimate the steerability of a model across various persona dimensions and directions. Our benchmark reveals that the steerability of many current models is limited -- due to both a skew in their baseline behavior and an asymmetry in their steerability across many persona dimensions. We release an implementation of our benchmark at <a href="https://github.com/IBM/prompt-steering" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item373'>[373]</a> <a href ="/abs/2411.14483" title="Abstract" id="2411.14483"> arXiv:2411.14483 </a> (replaced) [<a href="/pdf/2411.14483" title="Download PDF" id="pdf-2411.14483" aria-labelledby="pdf-2411.14483">pdf</a>, <a href="https://arxiv.org/html/2411.14483v2" title="View HTML" id="html-2411.14483" aria-labelledby="html-2411.14483" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14483" title="Other formats" id="oth-2411.14483" aria-labelledby="oth-2411.14483">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Ranking Unraveled: Recipes for LLM Rankings in Head-to-Head AI Combat </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Daynauth,+R">Roland Daynauth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Clarke,+C">Christopher Clarke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Flautner,+K">Krisztian Flautner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+L">Lingjia Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mars,+J">Jason Mars</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Deciding which large language model (LLM) to use is a complex challenge. Pairwise ranking has emerged as a new method for evaluating human preferences for LLMs. This approach entails humans evaluating pairs of model outputs based on a predefined criterion. By collecting these comparisons, a ranking can be constructed using methods such as Elo. However, applying these algorithms as constructed in the context of LLM evaluation introduces several challenges. In this paper, we explore the effectiveness of ranking systems for head-to-head comparisons of LLMs. 
We formally define a set of fundamental principles for effective ranking and conduct a series of extensive evaluations on the robustness of several ranking algorithms in the context of LLMs. Our analysis uncovers key insights into the factors that affect ranking accuracy and efficiency, offering guidelines for selecting the most appropriate methods based on specific evaluation contexts and resource constraints. </p> </div> </dd> <dt> <a name='item374'>[374]</a> <a href ="/abs/2411.16365" title="Abstract" id="2411.16365"> arXiv:2411.16365 </a> (replaced) [<a href="/pdf/2411.16365" title="Download PDF" id="pdf-2411.16365" aria-labelledby="pdf-2411.16365">pdf</a>, <a href="https://arxiv.org/html/2411.16365v2" title="View HTML" id="html-2411.16365" aria-labelledby="html-2411.16365" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.16365" title="Other formats" id="oth-2411.16365" aria-labelledby="oth-2411.16365">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-modal Retrieval Augmented Multi-modal Generation: Datasets, Evaluation Metrics and Strong Baseliness </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zi-Ao Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lan,+T">Tian Lan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+R">Rong-Cheng Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yong Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yu-Shi Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Heyan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+X">Xian-Ling Mao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and 
Language (cs.CL)</span> </div> <p class='mathjax'> We present a systematic investigation of Multi-modal Retrieval Augmented Multi-modal Generation (M$^2$RAG), a novel task that enables foundation models to process multi-modal web content and generate multi-modal responses, which exhibits better information density and readability. Despite its potential impact, M$^2$RAG remains understudied, lacking comprehensive analysis and high-quality data resources. To address this gap, we establish a comprehensive benchmark through a rigorous data curation pipeline, and employ text-modal metrics and multi-modal metrics based on foundation models for evaluation. We further propose several strategies for foundation models to process M$^2$RAG effectively and construct a training set by filtering high-quality samples using designed metrics. Our extensive experiments demonstrate the reliability of our proposed metrics, a landscape of model performance within our designed strategies, and show that our fine-tuned 7B-8B models outperform the state-of-the-art GPT-4o model. Additionally, we perform fine-grained analyses across diverse domains and validate the effectiveness of our designs in data curation pipeline. All resources, including codes, datasets, and model weights, will be publicly released. 
</p> </div> </dd> <dt> <a name='item375'>[375]</a> <a href ="/abs/2412.09879" title="Abstract" id="2412.09879"> arXiv:2412.09879 </a> (replaced) [<a href="/pdf/2412.09879" title="Download PDF" id="pdf-2412.09879" aria-labelledby="pdf-2412.09879">pdf</a>, <a href="https://arxiv.org/html/2412.09879v2" title="View HTML" id="html-2412.09879" aria-labelledby="html-2412.09879" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.09879" title="Other formats" id="oth-2412.09879" aria-labelledby="oth-2412.09879">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On the Limit of Language Models as Planning Formalizers </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Cassie Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Li Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models have been shown to fail to create executable and verifiable plans in grounded environments. An emerging line of work shows success in using LLM as a formalizer to generate a formal representation (e.g., PDDL) of the planning domain, which can be deterministically solved to find a plan. We systematically evaluate this methodology while bridging some major gaps. While previous work only generates a partial PDDL representation given templated and thus unrealistic environment descriptions, we generate the complete representation given descriptions of various naturalness levels. Among an array of observations critical to improve LLMs' formal planning ability, we note that large enough models can effectively formalize descriptions as PDDL, outperforming those directly generating plans, while being robust to lexical perturbation. 
As the descriptions become more natural-sounding, we observe a decrease in performance and provide detailed error analysis. </p> </div> </dd> <dt> <a name='item376'>[376]</a> <a href ="/abs/2412.11041" title="Abstract" id="2412.11041"> arXiv:2412.11041 </a> (replaced) [<a href="/pdf/2412.11041" title="Download PDF" id="pdf-2412.11041" aria-labelledby="pdf-2412.11041">pdf</a>, <a href="https://arxiv.org/html/2412.11041v2" title="View HTML" id="html-2412.11041" aria-labelledby="html-2412.11041" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.11041" title="Other formats" id="oth-2412.11041" aria-labelledby="oth-2412.11041">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Separate the Wheat from the Chaff: A Post-Hoc Approach to Safety Re-Alignment for Fine-Tuned Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+D">Di Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+X">Xin Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Y">Yanyan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 14 figures, </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Although large language models (LLMs) achieve effective safety alignment at the time of release, they still face various safety challenges. A key issue is that fine-tuning often compromises the safety alignment of LLMs. To address this issue, we propose a method named IRR (Identify, Remove, and Recalibrate for Safety Realignment) that performs safety realignment for LLMs. 
The core of IRR is to identify and remove unsafe delta parameters from the fine-tuned models, while recalibrating the retained ones. We evaluate the effectiveness of IRR across various datasets, including both full fine-tuning and LoRA methods. Our results demonstrate that IRR significantly enhances the safety performance of fine-tuned models on safety benchmarks, such as harmful queries and jailbreak attacks, while maintaining their performance on downstream tasks. The source code is available at: <a href="https://anonymous.4open.science/r/IRR-BD4F" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item377'>[377]</a> <a href ="/abs/2412.11167" title="Abstract" id="2412.11167"> arXiv:2412.11167 </a> (replaced) [<a href="/pdf/2412.11167" title="Download PDF" id="pdf-2412.11167" aria-labelledby="pdf-2412.11167">pdf</a>, <a href="https://arxiv.org/html/2412.11167v2" title="View HTML" id="html-2412.11167" aria-labelledby="html-2412.11167" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.11167" title="Other formats" id="oth-2412.11167" aria-labelledby="oth-2412.11167">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cultural Palette: Pluralising Culture Alignment via Multi-agent Palette </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+J">Jiahao Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Di,+Z">Zixiang Di</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shangzixin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naseem,+U">Usman Naseem</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) face challenges in aligning with diverse cultural values 
despite their remarkable performance in generation, which stems from inherent monocultural biases and difficulties in capturing nuanced cultural semantics. Existing methods struggle to adapt to unknown culture after fine-tuning. Inspired by cultural geography across five continents, we propose Cultural Palette, a multi-agent framework that redefines cultural alignment as an adaptive "color-blending" process for country-specific adaptation. Our approach harnesses cultural geography across five continents (Africa, America, Asia, Europe, Oceania) through three key steps: First, we synthesize the Pentachromatic Cultural Palette Dataset using GPT-4o, refining continental-level dialogues with Hofstede cultural dimensions to establish foundational cultural representations. Second, five continent-level alignment agents form specialized cultural communities that generate region-specific draft responses. Third, a Meta Agent employs Cultural MoErges to dynamically blend these cultural "colors" through attention-gated parameter merging, akin to mixing pigments on a palette, resolving conflicts while preserving cultural nuances to produce the final culturally-aligned response. Extensive experiments across various countries demonstrate that Cultural Palette surpasses existing baselines in cultural alignment. </p> </div> </dd> <dt> <a name='item378'>[378]</a> <a href ="/abs/2412.12072" title="Abstract" id="2412.12072"> arXiv:2412.12072 </a> (replaced) [<a href="/pdf/2412.12072" title="Download PDF" id="pdf-2412.12072" aria-labelledby="pdf-2412.12072">pdf</a>, <a href="https://arxiv.org/html/2412.12072v2" title="View HTML" id="html-2412.12072" aria-labelledby="html-2412.12072" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12072" title="Other formats" id="oth-2412.12072" aria-labelledby="oth-2412.12072">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Making FETCH! 
Happen: Finding Emergent Dog Whistles Through Common Habitats </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sasse,+K">Kuleen Sasse</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aguirre,+C">Carlos Aguirre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cachola,+I">Isabel Cachola</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Levy,+S">Sharon Levy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dredze,+M">Mark Dredze</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> WARNING: This paper contains content that may be upsetting or offensive to some readers. Dog whistles are coded expressions with dual meanings: one intended for the general public (outgroup) and another that conveys a specific message to an intended audience (ingroup). Often, these expressions are used to convey controversial political opinions while maintaining plausible deniability and slip by content moderation filters. Identification of dog whistles relies on curated lexicons, which have trouble keeping up to date. We introduce FETCH!, a task for finding novel dog whistles in massive social media corpora. We find that state-of-the-art systems fail to achieve meaningful results across three distinct social media case studies. We present EarShot, a strong baseline system that combines the strengths of vector databases and Large Language Models (LLMs) to efficiently and effectively identify new dog whistles. 
</p> </div> </dd> <dt> <a name='item379'>[379]</a> <a href ="/abs/2412.12145" title="Abstract" id="2412.12145"> arXiv:2412.12145 </a> (replaced) [<a href="/pdf/2412.12145" title="Download PDF" id="pdf-2412.12145" aria-labelledby="pdf-2412.12145">pdf</a>, <a href="/format/2412.12145" title="Other formats" id="oth-2412.12145" aria-labelledby="oth-2412.12145">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Na'vi or Knave: Jailbreaking Language Models via Metaphorical Avatars </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Y">Yu Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+S">Sheng Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tong,+J">Junqi Tong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Min Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qi Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> We still need to polish our paper </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Metaphor serves as an implicit approach to convey information, while enabling the generalized comprehension of complex subjects. However, metaphor can potentially be exploited to bypass the safety alignment mechanisms of Large Language Models (LLMs), leading to the theft of harmful knowledge. In our study, we introduce a novel attack framework that exploits the imaginative capacity of LLMs to achieve jailbreaking, the J\underline{\textbf{A}}ilbreak \underline{\textbf{V}}ia \underline{\textbf{A}}dversarial Me\underline{\textbf{TA}} -pho\underline{\textbf{R}} (\textit{AVATAR}). 
Specifically, to elicit the harmful response, AVATAR extracts harmful entities from a given harmful target and maps them to innocuous adversarial entities based on LLM's imagination. Then, according to these metaphors, the harmful target is nested within human-like interaction for jailbreaking adaptively. Experimental results demonstrate that AVATAR can effectively and transferably jailbreak LLMs and achieve a state-of-the-art attack success rate across multiple advanced LLMs. Our study exposes a security risk in LLMs from their endogenous imaginative capabilities. Furthermore, the analytical study reveals the vulnerability of LLM to adversarial metaphors and the necessity of developing defense methods against jailbreaking caused by the adversarial metaphor. \textcolor{orange}{ \textbf{Warning: This paper contains potentially harmful content from LLMs.}} </p> </div> </dd> <dt> <a name='item380'>[380]</a> <a href ="/abs/2412.12499" title="Abstract" id="2412.12499"> arXiv:2412.12499 </a> (replaced) [<a href="/pdf/2412.12499" title="Download PDF" id="pdf-2412.12499" aria-labelledby="pdf-2412.12499">pdf</a>, <a href="https://arxiv.org/html/2412.12499v2" title="View HTML" id="html-2412.12499" aria-labelledby="html-2412.12499" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12499" title="Other formats" id="oth-2412.12499" aria-labelledby="oth-2412.12499">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LinguaLIFT: An Effective Two-stage Instruction Tuning Framework for Low-Resource Language Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongbin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+K">Kehai Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+X">Xuefeng Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+Y">Yang Xiang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) have exhibited impressive multilingual reasoning capabilities, driven by extensive multilingual pre-training corpora and instruction fine-tuning data. However, a performance gap exists between high- and low-resource language reasoning tasks due to the language imbalance in the pre-training corpus, which is exacerbated by evaluation bias in existing reasoning benchmarks lacking low-resource language coverage. To alleviate this issue, we propose LinguaLIFT, a two-stage instruction tuning framework for advancing low-resource language reasoning. LinguaLIFT employs a language alignment layer to capture multilingual alignment in a code-switched tuning way without requiring multilingual instruction or parallel data, thereby transferring the cross-lingual reasoning capabilities to low-resource languages through English-only instruction tuning data. To comprehensively evaluate the multilingual reasoning capabilities, we introduce the Multilingual Math World Problem (MMWP) benchmark, which spans 21 low-resource, 17 medium-resource, and 10 high-resource languages. Experimental results show that LinguaLIFT outperforms several competitive baselines across MMWP and four widely used benchmarks. 
</p> </div> </dd> <dt> <a name='item381'>[381]</a> <a href ="/abs/2412.12527" title="Abstract" id="2412.12527"> arXiv:2412.12527 </a> (replaced) [<a href="/pdf/2412.12527" title="Download PDF" id="pdf-2412.12527" aria-labelledby="pdf-2412.12527">pdf</a>, <a href="https://arxiv.org/html/2412.12527v2" title="View HTML" id="html-2412.12527" aria-labelledby="html-2412.12527" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12527" title="Other formats" id="oth-2412.12527" aria-labelledby="oth-2412.12527">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> When to Speak, When to Abstain: Contrastive Decoding with Abstention </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H+J">Hyuhng Joon Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+Y">Youna Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sang-goo Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+T">Taeuk Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> under-review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large Language Models (LLMs) demonstrate exceptional performance across diverse tasks by leveraging pre-trained (i.e., parametric) and external (i.e., contextual) knowledge. While substantial efforts have been made to enhance the utilization of both forms of knowledge, situations in which models lack relevant information remain underexplored. To investigate this challenge, we first present a controlled testbed featuring four distinct knowledge access scenarios, including the aforementioned edge case, revealing that conventional LLM usage exhibits insufficient robustness in handling all instances. 
Addressing this limitation, we propose Contrastive Decoding with Abstention (CDA), a novel training-free decoding method that allows LLMs to generate responses when relevant knowledge is available and to abstain otherwise. CDA estimates the relevance of both knowledge sources for a given input, adaptively deciding which type of information to prioritize and which to exclude. Through extensive experiments, we demonstrate that CDA can effectively perform accurate generation and abstention simultaneously, enhancing reliability and preserving user trust. </p> </div> </dd> <dt> <a name='item382'>[382]</a> <a href ="/abs/2412.12583" title="Abstract" id="2412.12583"> arXiv:2412.12583 </a> (replaced) [<a href="/pdf/2412.12583" title="Download PDF" id="pdf-2412.12583" aria-labelledby="pdf-2412.12583">pdf</a>, <a href="https://arxiv.org/html/2412.12583v2" title="View HTML" id="html-2412.12583" aria-labelledby="html-2412.12583" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.12583" title="Other formats" id="oth-2412.12583" aria-labelledby="oth-2412.12583">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Process-Supervised Reward Models for Verifying Clinical Note Generation: A Scalable Approach Guided by Domain Expertise </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hanyin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Chufan Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+Q">Qiping Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+B">Bolun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hussein,+G">Guleid Hussein</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Korsapati,+H">Hariprasad Korsapati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Labban,+M+E">Mohamad El Labban</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Iheasirim,+K">Kingsley Iheasirim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hassan,+M">Mohamed Hassan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anil,+G">Gokhan Anil</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bartlett,+B">Brian Bartlett</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+J">Jimeng Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Process-supervised reward models (PRMs), which verify large language model (LLM) outputs step-by-step, have achieved significant success in mathematical and coding problems. However, their application to other domains remains largely unexplored. In this work, we train a PRM to provide step-level reward signals for clinical notes generated by LLMs from patient-doctor dialogues. Guided by real-world clinician expertise, we carefully designed step definitions for clinical notes and utilized Gemini-Pro 1.5 to automatically generate process supervision data at scale. Our proposed PRM, trained on the LLaMA-3.1 8B instruct model, outperformed both Gemini-Pro 1.5 and the vanilla outcome-supervised reward model (ORM) in two key evaluations: (1) selecting gold-reference samples from error-containing ones, achieving 98.8% accuracy (versus 70.0% for the vanilla ORM and 93.8% for Gemini-Pro 1.5), and (2) selecting physician-preferred notes, achieving 56.2% accuracy (compared to 37.5% for the vanilla ORM and 50.0% for Gemini-Pro 1.5). Additionally, we conducted ablation studies to determine optimal loss functions and data selection strategies, along with physician reader studies to explore predictors of downstream Best-of-N performance. 
Our promising results suggest the potential of PRMs to extend beyond the clinical domain, offering a scalable and effective solution for diverse generative tasks. </p> </div> </dd> <dt> <a name='item383'>[383]</a> <a href ="/abs/2412.13018" title="Abstract" id="2412.13018"> arXiv:2412.13018 </a> (replaced) [<a href="/pdf/2412.13018" title="Download PDF" id="pdf-2412.13018" aria-labelledby="pdf-2412.13018">pdf</a>, <a href="https://arxiv.org/html/2412.13018v2" title="View HTML" id="html-2412.13018" aria-labelledby="html-2412.13018" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.13018" title="Other formats" id="oth-2412.13018" aria-labelledby="oth-2412.13018">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OmniEval: An Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+J">Jiejun Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dou,+Z">Zhicheng Dou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+J">Ji-Rong Wen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As a typical and practical application of Large Language Models (LLMs), Retrieval-Augmented Generation (RAG) techniques have gained extensive attention, particularly in vertical domains where LLMs may lack domain-specific knowledge. In this paper, we introduce an omnidirectional and automatic RAG benchmark, OmniEval, in the financial domain. 
Our benchmark is characterized by its multi-dimensional evaluation framework, including (1) a matrix-based RAG scenario evaluation system that categorizes queries into five task classes and 16 financial topics, leading to a structured assessment of diverse query scenarios; (2) a multi-dimensional evaluation data generation approach, which combines GPT-4-based automatic generation and human annotation, achieving an 87.47\% acceptance ratio in human evaluations on generated instances; (3) a multi-stage evaluation system that evaluates both retrieval and generation performance, resulting in a comprehensive evaluation on the RAG pipeline; and (4) robust evaluation metrics derived from rule-based and LLM-based ones, enhancing the reliability of assessments through manual annotations and supervised fine-tuning of an LLM evaluator. Our experiments demonstrate the comprehensiveness of OmniEval, which includes extensive test datasets and highlights the performance variations of RAG systems across diverse topics and tasks, revealing significant opportunities for RAG models to improve their capabilities in vertical domains. We open source the code of our benchmark in \href{<a href="https://github.com/RUC-NLPIR/OmniEval" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}{<a href="https://github.com/RUC-NLPIR/OmniEval" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}. 
</p> </div> </dd> <dt> <a name='item384'>[384]</a> <a href ="/abs/2412.14838" title="Abstract" id="2412.14838"> arXiv:2412.14838 </a> (replaced) [<a href="/pdf/2412.14838" title="Download PDF" id="pdf-2412.14838" aria-labelledby="pdf-2412.14838">pdf</a>, <a href="https://arxiv.org/html/2412.14838v2" title="View HTML" id="html-2412.14838" aria-labelledby="html-2412.14838" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.14838" title="Other formats" id="oth-2412.14838" aria-labelledby="oth-2412.14838">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+X">Xiabin Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenbin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+M">Minyan Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jiaxian Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xuebo Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+L">Li Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+L">Liang Ding</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Efficient KV cache management in LLMs is crucial for long-context tasks like RAG and summarization. Existing KV cache compression methods enforce a fixed pattern, neglecting task-specific characteristics and reducing the retention of essential information. 
However, we observe distinct activation patterns across layers in various tasks, highlighting the need for adaptive strategies tailored to each task's unique demands. Based on this insight, we propose DynamicKV, a method that dynamically optimizes token retention by adjusting the number of tokens retained at each layer to adapt to the specific task. DynamicKV establishes global and per-layer maximum KV cache budgets, temporarily retaining the maximum budget for the current layer, and periodically updating the KV cache sizes of all preceding layers during inference. Our method retains only 1.7% of the KV cache size while achieving ~85% of the Full KV cache performance on LongBench. Notably, even under extreme compression (0.9%), DynamicKV surpasses state-of-the-art (SOTA) methods by 11% in the Needle-in-a-Haystack test using Mistral-7B-Instruct-v0.2. The code will be released. </p> </div> </dd> <dt> <a name='item385'>[385]</a> <a href ="/abs/2412.15628" title="Abstract" id="2412.15628"> arXiv:2412.15628 </a> (replaced) [<a href="/pdf/2412.15628" title="Download PDF" id="pdf-2412.15628" aria-labelledby="pdf-2412.15628">pdf</a>, <a href="https://arxiv.org/html/2412.15628v2" title="View HTML" id="html-2412.15628" aria-labelledby="html-2412.15628" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.15628" title="Other formats" id="oth-2412.15628" aria-labelledby="oth-2412.15628">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Can Input Attributions Interpret the Inductive Reasoning Process Elicited in In-Context Learning? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+M">Mengyu Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kuribayashi,+T">Tatsuki Kuribayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kobayashi,+G">Goro Kobayashi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Suzuki,+J">Jun Suzuki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Interpreting the internal process of neural models has long been a challenge. This challenge remains relevant in the era of large language models (LLMs) and in-context learning (ICL); for example, ICL poses a new issue of interpreting which example in the few-shot examples contributed to identifying/solving the task. To this end, in this paper, we design synthetic diagnostic tasks of inductive reasoning, inspired by the generalization tests in linguistics; here, most in-context examples are ambiguous w.r.t. their underlying rule, and one critical example disambiguates the task demonstrated. The question is whether conventional input attribution (IA) methods can track such a reasoning process, i.e., identify the influential example, in ICL. Our experiments provide several practical findings; for example, a certain simple IA method works the best, and the larger the model, the generally harder it is to interpret the ICL with gradient-based IA methods. 
</p> </div> </dd> <dt> <a name='item386'>[386]</a> <a href ="/abs/2412.16516" title="Abstract" id="2412.16516"> arXiv:2412.16516 </a> (replaced) [<a href="/pdf/2412.16516" title="Download PDF" id="pdf-2412.16516" aria-labelledby="pdf-2412.16516">pdf</a>, <a href="https://arxiv.org/html/2412.16516v2" title="View HTML" id="html-2412.16516" aria-labelledby="html-2412.16516" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.16516" title="Other formats" id="oth-2412.16516" aria-labelledby="oth-2412.16516">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HammerBench: Fine-Grained Function-Calling Evaluation in Real Mobile Device Scenarios </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jiamu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+M">Muning Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mo,+X">Xiaoyun Mo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Haoyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Q">Qiqiang Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+C">Cheng Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xihuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weinan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+Q">Qiuying Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jun Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Evaluating the performance of LLMs in multi-turn human-agent interactions presents 
significant challenges, particularly due to the complexity and variability of user behavior. In this paper, we introduce HammerBench, a novel benchmark framework for assessing LLMs' function-calling capabilities in real-world, multi-turn dialogues. HammerBench simulates diverse mobile assistant use cases, incorporating imperfect instructions, dynamic question-answer trajectories, intent and argument shifts, and the indirect use of external information through pronouns. To construct this benchmark, we curate a comprehensive dataset derived from popular mobile app functionalities and anonymized user logs, complemented by a cost-effective data generation pipeline leveraging open-source models. HammerBench is further augmented with fine-grained interaction snapshots and metrics, enabling detailed evaluation of function-calling performance across individual conversational turns. We demonstrate the effectiveness of HammerBench by evaluating several leading LLMs and uncovering key performance trends. Our experiments reveal that different types of parameter name errors are a significant source of failure across different interaction scenarios, highlighting critical areas for further improvement in LLM robustness for mobile assistant applications. 
</p> </div> </dd> <dt> <a name='item387'>[387]</a> <a href ="/abs/2412.18053" title="Abstract" id="2412.18053"> arXiv:2412.18053 </a> (replaced) [<a href="/pdf/2412.18053" title="Download PDF" id="pdf-2412.18053" aria-labelledby="pdf-2412.18053">pdf</a>, <a href="https://arxiv.org/html/2412.18053v2" title="View HTML" id="html-2412.18053" aria-labelledby="html-2412.18053" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18053" title="Other formats" id="oth-2412.18053" aria-labelledby="oth-2412.18053">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neuron Empirical Gradient: Discovering and Quantifying Neurons Global Linear Controllability </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xin Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zehui Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yoshinaga,+N">Naoki Yoshinaga</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 29 pages, 19 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Although feed-forward neurons in pre-trained language models (PLMs) can store knowledge and their importance in influencing model outputs has been studied, existing work focuses on finding a limited set of neurons and analyzing their relative importance. However, the global quantitative role of activation values in shaping outputs remains unclear, hindering further advancements in applications like knowledge editing. Our study first investigates the numerical relationship between neuron activations and model output and discovers the global linear relationship between them through neuron interventions on a knowledge probing dataset. 
We refer to the gradient of this linear relationship as neuron empirical gradient (NEG), and introduce NeurGrad, an accurate and efficient method for computing NEG. NeurGrad enables quantitative analysis of all neurons in PLMs, advancing our understanding of neurons' controllability. Furthermore, we explore NEG's ability to represent language skills across diverse prompts via skill neuron probing. Experiments on MCEval8k, a multi-choice knowledge benchmark spanning various genres, validate NEG's representational ability. The data and code are released. </p> </div> </dd> <dt> <a name='item388'>[388]</a> <a href ="/abs/2412.18196" title="Abstract" id="2412.18196"> arXiv:2412.18196 </a> (replaced) [<a href="/pdf/2412.18196" title="Download PDF" id="pdf-2412.18196" aria-labelledby="pdf-2412.18196">pdf</a>, <a href="https://arxiv.org/html/2412.18196v2" title="View HTML" id="html-2412.18196" aria-labelledby="html-2412.18196" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18196" title="Other formats" id="oth-2412.18196" aria-labelledby="oth-2412.18196">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Robustness-aware Automatic Prompt Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+Z">Zeru Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhenting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Y">Yongye Su</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+W">Weidi Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+H">Hang Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+F">Fan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+R">Ruixiang Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yongfeng Zhang</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> The performance of Large Language Models (LLMs) depends on the quality of prompts and the semantic and structural integrity of the input data. However, existing prompt generation methods primarily focus on well-structured input data, often neglecting the impact of perturbed inputs on prompt effectiveness. To address this limitation, we propose BATprompt (By Adversarial Training prompt), a novel method for prompt generation designed to withstand input perturbations (such as typos in the input). Inspired by adversarial training techniques, BATprompt demonstrates strong performance on a variety of perturbed tasks through a two-step process: adversarial perturbation and iterative optimization on unperturbed input via LLM. Unlike conventional adversarial attack methods, BATprompt does not need access to model parameters and gradients. Instead, BATprompt leverages the advanced reasoning, language understanding and self reflection capabilities of LLMs to simulate gradients, guiding the generation of adversarial perturbations and optimizing prompt performance. We evaluate BATprompt on multiple datasets across both language understanding and generation tasks. The results indicate that BATprompt outperforms existing prompt generation methods, delivering superior robustness and performance under diverse perturbation scenarios. 
</p> </div> </dd> <dt> <a name='item389'>[389]</a> <a href ="/abs/2412.18367" title="Abstract" id="2412.18367"> arXiv:2412.18367 </a> (replaced) [<a href="/pdf/2412.18367" title="Download PDF" id="pdf-2412.18367" aria-labelledby="pdf-2412.18367">pdf</a>, <a href="https://arxiv.org/html/2412.18367v5" title="View HTML" id="html-2412.18367" aria-labelledby="html-2412.18367" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18367" title="Other formats" id="oth-2412.18367" aria-labelledby="oth-2412.18367">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Global AI Inclusivity: A Large-Scale Multilingual Terminology Dataset (GIST) </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiarui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ouzzani,+I">Iman Ouzzani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+W">Wenkai Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+L">Lechen Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ou,+T">Tianyue Ou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bouamor,+H">Houda Bouamor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Z">Zhijing Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Diab,+M">Mona Diab</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The field of machine translation has achieved significant advancements, yet domain-specific terminology translation, particularly in AI, remains challenging. We introduce GIST, a large-scale multilingual AI terminology dataset containing 5K terms extracted from top AI conference papers spanning 2000 to 2023. 
The terms are translated into Arabic, Chinese, French, Japanese, and Russian using a hybrid framework that combines LLMs for extraction with human expertise for translation. The dataset's quality is benchmarked against existing resources, demonstrating superior translation accuracy through crowdsourced evaluation. GIST is integrated into translation workflows using post-translation refinement methods that require no retraining, where LLM prompting consistently improves BLEU and COMET scores. A web demonstration on the ACL Anthology platform highlights its practical application, showcasing improved accessibility for non-English speakers. This work aims to address critical gaps in AI terminology resources and fosters global inclusivity and collaboration in AI research. </p> </div> </dd> <dt> <a name='item390'>[390]</a> <a href ="/abs/2412.18547" title="Abstract" id="2412.18547"> arXiv:2412.18547 </a> (replaced) [<a href="/pdf/2412.18547" title="Download PDF" id="pdf-2412.18547" aria-labelledby="pdf-2412.18547">pdf</a>, <a href="https://arxiv.org/html/2412.18547v4" title="View HTML" id="html-2412.18547" aria-labelledby="html-2412.18547" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.18547" title="Other formats" id="oth-2412.18547" aria-labelledby="oth-2412.18547">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Token-Budget-Aware LLM Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+T">Tingxu Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhenting Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+C">Chunrong Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Shiyu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+S">Shiqing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhenyu 
Chen</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Reasoning is critical for large language models (LLMs) to excel in a wide range of tasks. While methods like Chain-of-Thought (CoT) reasoning enhance LLM performance by decomposing problems into intermediate steps, they also incur significant overhead in token usage, leading to increased costs. We find that the reasoning process of current LLMs is unnecessarily lengthy and it can be compressed by including a reasonable token budget in the prompt, but the choice of token budget plays a crucial role in the actual compression effectiveness. We then propose a token-budget-aware LLM reasoning framework, which dynamically estimates token budgets for different problems based on reasoning complexity and uses the estimated token budgets to guide the reasoning process. Experiments show that our method effectively reduces token costs in CoT reasoning with only a slight performance reduction, offering a practical solution to balance efficiency and accuracy in LLM reasoning. Code: <a href="https://github.com/GeniusHTX/TALE" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item391'>[391]</a> <a href ="/abs/2412.21036" title="Abstract" id="2412.21036"> arXiv:2412.21036 </a> (replaced) [<a href="/pdf/2412.21036" title="Download PDF" id="pdf-2412.21036" aria-labelledby="pdf-2412.21036">pdf</a>, <a href="https://arxiv.org/html/2412.21036v2" title="View HTML" id="html-2412.21036" aria-labelledby="html-2412.21036" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.21036" title="Other formats" id="oth-2412.21036" aria-labelledby="oth-2412.21036">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GePBench: Evaluating Fundamental Geometric Perception for Multimodal Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+S">Shangyu Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+C">Changhao Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+Y">Yuteng Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yue,+Y">Yifan Yue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zhen Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xinyu Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zhangtai Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+F">Fei Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+X">Xinyu Dai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Multimodal large language models (MLLMs) have made significant progress in integrating visual and linguistic understanding. 
Existing benchmarks typically focus on high-level semantic capabilities, such as scene understanding and visual reasoning, but often overlook a crucial, foundational ability: geometric perception. Geometric perception involves understanding geometric shapes, structures, and spatial relationships, which are essential for supporting higher-level semantic tasks. Despite its importance, this capability remains underexplored in current MLLM research. To address this gap, we introduce GePBench, a novel benchmark designed to assess the geometric perception abilities of MLLMs. Our extensive evaluations reveal that current state-of-the-art MLLMs exhibit significant deficiencies in geometric perception tasks. Furthermore, we show that models trained with GePBench data demonstrate substantial improvements on a wide range of benchmark tasks, highlighting the critical role of geometric perception in enabling advanced multimodal applications. Our code and datasets will be publicly available. </p> </div> </dd> <dt> <a name='item392'>[392]</a> <a href ="/abs/2501.01046" title="Abstract" id="2501.01046"> arXiv:2501.01046 </a> (replaced) [<a href="/pdf/2501.01046" title="Download PDF" id="pdf-2501.01046" aria-labelledby="pdf-2501.01046">pdf</a>, <a href="https://arxiv.org/html/2501.01046v2" title="View HTML" id="html-2501.01046" aria-labelledby="html-2501.01046" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.01046" title="Other formats" id="oth-2501.01046" aria-labelledby="oth-2501.01046">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FED: Fast and Efficient Dataset Deduplication Framework with GPU Acceleration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Son,+Y">Youngjun Son</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+C">Chaewon Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jaejin 
Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Dataset deduplication plays a crucial role in enhancing data quality, ultimately improving the training performance and efficiency of large language models. A commonly used method for data deduplication is the MinHash LSH algorithm. Recently, NVIDIA introduced a GPU-based MinHash LSH deduplication method, but it remains suboptimal, leaving room for further improvement in processing efficiency. This paper proposes a GPU-accelerated deduplication framework, FED, that optimizes MinHash LSH for GPU clusters and leverages computationally efficient, partially reusable non-cryptographic hash functions. FED significantly outperforms the CPU-based deduplication tool in SlimPajama (using 64 logical CPU cores) by up to 107.2 times and the GPU-based tool in NVIDIA NeMo Curator by up to 6.3 times when processing 30 million documents on a node with four GPUs. Notably, our method dramatically accelerates the previously time-consuming MinHash signature generation phase, achieving speed-ups of up to 260 compared to the CPU baseline. Despite these gains in efficiency, FED maintains high deduplication quality, with the duplicate document sets reaching a Jaccard similarity of over 0.96 compared to those identified by the standard MinHash algorithm. In large-scale experiments, the deduplication of 1.2 trillion tokens is completed in just 6 hours in a four-node, 16-GPU environment. The related code is publicly available on GitHub (\href{<a href="https://github.com/mcrl/FED" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}{<a href="https://github.com/mcrl/FED" rel="external noopener nofollow" class="link-external link-https">this https URL</a>}). 
</p> </div> </dd> <dt> <a name='item393'>[393]</a> <a href ="/abs/2501.01743" title="Abstract" id="2501.01743"> arXiv:2501.01743 </a> (replaced) [<a href="/pdf/2501.01743" title="Download PDF" id="pdf-2501.01743" aria-labelledby="pdf-2501.01743">pdf</a>, <a href="https://arxiv.org/html/2501.01743v2" title="View HTML" id="html-2501.01743" aria-labelledby="html-2501.01743" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.01743" title="Other formats" id="oth-2501.01743" aria-labelledby="oth-2501.01743">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Automating Legal Concept Interpretation with LLMs: Retrieval, Generation, and Evaluation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangcheng Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Q">Quzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+C">Cong Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+Y">Yansong Feng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Legal articles often include vague concepts for adapting to the ever-changing society. Providing detailed interpretations of these concepts is a critical and challenging task even for legal practitioners. It requires meticulous and professional annotations and summarizations by legal experts, which are admittedly time-consuming and expensive to collect at scale. By emulating legal experts' doctrinal method, we introduce a novel framework, ATRIE, using large language models (LLMs) to AuTomatically Retrieve concept-related information, Interpret legal concepts, and Evaluate generated interpretations, eliminating dependence on legal experts. 
ATRIE comprises a legal concept interpreter and a legal concept interpretation evaluator. The interpreter uses LLMs to retrieve relevant information from judicial precedents and interpret legal concepts. The evaluator uses performance changes on legal concept entailment, a downstream task we propose, as a proxy of interpretation quality. Automatic and multifaceted human evaluations indicate that the quality of our interpretations is comparable to those written by legal experts, with superior comprehensiveness and readability. Although there remains a slight gap in accuracy, it can already assist legal practitioners in improving the efficiency of concept interpretation. </p> </div> </dd> <dt> <a name='item394'>[394]</a> <a href ="/abs/2501.02795" title="Abstract" id="2501.02795"> arXiv:2501.02795 </a> (replaced) [<a href="/pdf/2501.02795" title="Download PDF" id="pdf-2501.02795" aria-labelledby="pdf-2501.02795">pdf</a>, <a href="https://arxiv.org/html/2501.02795v3" title="View HTML" id="html-2501.02795" aria-labelledby="html-2501.02795" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.02795" title="Other formats" id="oth-2501.02795" aria-labelledby="oth-2501.02795">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InfiFusion: A Unified Framework for Enhanced Cross-Model Reasoning via LLM Fusion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+Z">Zhaoyi Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yiming Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+B">Baoyi He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+Y">Yuhao Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Q">Qi Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sang,+Z">Zhijie Sang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+C">Chunlin Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Shengyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+F">Fei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Significant performance improvements over the previous version; under review; </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> We introduce InfiFusion, an efficient training pipeline designed to integrate multiple domain-specialized Large Language Models (LLMs) into a single pivot model, effectively harnessing the strengths of each source model. Traditional fusion methods either merge model parameters directly or rely on knowledge distillation with rigid assumptions, limiting their flexibility and efficiency. InfiFusion overcomes these limitations by enhancing Universal Logit Distillation (ULD) with Top-K selection and Logits Standardization. We propose two fusion strategies: Pairwise Fusion (InfiFusion$_p$), where each source model's knowledge is distilled individually into the pivot model followed by merging, and Unified Fusion (InfiFusion$_u$), where knowledge from all source models is distilled simultaneously into the pivot model. InfiFusion outperforms the state-of-the-art models, such as Qwen-2.5-14B-Instruct and Phi-4, across 11 widely applied benchmarks covering reasoning, coding, mathematics, and instruction-following tasks. Notably, InfiFusion achieves this superior performance while significantly reducing computational costs, completing full training with only 160 H800 GPU hours compared to the millions typically required for traditional LLM training. 
</p> </div> </dd> <dt> <a name='item395'>[395]</a> <a href ="/abs/2501.02979" title="Abstract" id="2501.02979"> arXiv:2501.02979 </a> (replaced) [<a href="/pdf/2501.02979" title="Download PDF" id="pdf-2501.02979" aria-labelledby="pdf-2501.02979">pdf</a>, <a href="https://arxiv.org/html/2501.02979v2" title="View HTML" id="html-2501.02979" aria-labelledby="html-2501.02979" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.02979" title="Other formats" id="oth-2501.02979" aria-labelledby="oth-2501.02979">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Registering Source Tokens to Target Language Spaces in Multilingual Neural Machine Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+Z">Zhi Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yiran Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+J">Jiannan Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+C">Chenchen Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tanaka,+H">Hideki Tanaka</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Utiyama,+M">Masao Utiyama</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Watanabe,+T">Taro Watanabe</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> The multilingual neural machine translation (MNMT) aims for arbitrary translations across multiple languages. Although MNMT-specific models trained by parallel data offer low costs in training and deployment, their performance consistently lags behind that of large language models (LLMs). In this work, we introduce registering, a novel method that enables a small MNMT-specific model to compete with LLMs. 
Specifically, we insert a set of artificial tokens specifying the target language, called registers, into the input sequence between the source and target tokens. By modifying the attention mask, the target token generation only pays attention to the activation of registers, representing the source tokens in the target language space. Experiments on EC-40, a large-scale benchmark, show that our method advances the state-of-the-art of MNMT. We further pre-train two models, namely MITRE (multilingual translation with registers), by 9.3 billion sentence pairs across 24 languages collected from public corpus. One of them, MITRE-913M, outperforms NLLB-3.3B, achieves comparable performance with commercial LLMs, and shows strong adaptability in fine-tuning. Finally, we open-source our models to facilitate further research and development in MNMT: <a href="https://github.com/zhiqu22/mitre" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item396'>[396]</a> <a href ="/abs/2501.03035" title="Abstract" id="2501.03035"> arXiv:2501.03035 </a> (replaced) [<a href="/pdf/2501.03035" title="Download PDF" id="pdf-2501.03035" aria-labelledby="pdf-2501.03035">pdf</a>, <a href="https://arxiv.org/html/2501.03035v2" title="View HTML" id="html-2501.03035" aria-labelledby="html-2501.03035" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03035" title="Other formats" id="oth-2501.03035" aria-labelledby="oth-2501.03035">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Quantization Meets Reasoning: Exploring LLM Low-Bit Quantization Degradation for Mathematical Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Z">Zhen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Su,+Y">Yupeng Su</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+R">Runming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+C">Congkai Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhongwei Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wong,+N">Ngai Wong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models have achieved significant advancements in complex mathematical reasoning benchmarks, such as MATH. However, their substantial computational requirements present challenges for practical deployment. Model quantization has emerged as an effective strategy to reduce memory usage and computational costs by employing lower precision and bit-width representations. In this study, we systematically evaluate the impact of quantization on mathematical reasoning tasks. Our results demonstrate that aggressive quantization methods like AWQ and GPTQ introduce up to 32.39% accuracy degradation (average 11.31%) on Llama-3 models, particularly in numerical computation and reasoning planning. To address this, we introduce a multidimensional evaluation framework combining qualitative capability analysis and quantitative error assessment. We further develop targeted recovery strategies, showing that fine-tuning quantized models on only 545 task-specific examples for 3 minutes on 4 GPUs effectively restores reasoning capabilities to near full-precision levels. 
Additionally, our error assessment pipeline achieves 98.9% accuracy in diagnosing and localizing errors across 3,366 failure cases, providing actionable insights for mitigating quantization-induced degradation. </p> </div> </dd> <dt> <a name='item397'>[397]</a> <a href ="/abs/2501.03191" title="Abstract" id="2501.03191"> arXiv:2501.03191 </a> (replaced) [<a href="/pdf/2501.03191" title="Download PDF" id="pdf-2501.03191" aria-labelledby="pdf-2501.03191">pdf</a>, <a href="https://arxiv.org/html/2501.03191v2" title="View HTML" id="html-2501.03191" aria-labelledby="html-2501.03191" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03191" title="Other formats" id="oth-2501.03191" aria-labelledby="oth-2501.03191">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLIX: Cross-Lingual Explanations of Idiomatic Expressions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gluck,+A">Aaron Gluck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=von+der+Wense,+K">Katharina von der Wense</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pacheco,+M+L">Maria Leonor Pacheco</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Automated definition generation systems have been proposed to support vocabulary expansion for language learners. The main barrier to the success of these systems is that learners often struggle to understand definitions due to the presence of potentially unfamiliar words and grammar, particularly when non-standard language is involved. To address these challenges, we propose CLIX, the task of Cross-Lingual explanations of Idiomatic eXpressions. 
We explore the capabilities of current NLP models for this task, and observe that while it remains challenging, large language models show promise. Finally, we perform a detailed error analysis to highlight the key challenges that need to be addressed before we can reliably incorporate these systems into educational tools. </p> </div> </dd> <dt> <a name='item398'>[398]</a> <a href ="/abs/2501.03226" title="Abstract" id="2501.03226"> arXiv:2501.03226 </a> (replaced) [<a href="/pdf/2501.03226" title="Download PDF" id="pdf-2501.03226" aria-labelledby="pdf-2501.03226">pdf</a>, <a href="https://arxiv.org/html/2501.03226v3" title="View HTML" id="html-2501.03226" aria-labelledby="html-2501.03226" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.03226" title="Other formats" id="oth-2501.03226" aria-labelledby="oth-2501.03226">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BoostStep: Boosting mathematical capability of Large Language Models via improved single-step reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+B">Beichen Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yuhong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+X">Xiaoyi Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zang,+Y">Yuhang Zang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+P">Pan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+H">Haodong Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yuhang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+D">Dahua Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiaqi Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Codes and Data are available at <a 
href="https://github.com/beichenzbc/BoostStep" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have demonstrated impressive ability in solving complex mathematical problems with multi-step reasoning and can be further enhanced with well-designed in-context learning (ICL) examples. However, this potential is often constrained by two major challenges in ICL: granularity mismatch and irrelevant information. We observe that while LLMs excel at decomposing mathematical problems, they often struggle with reasoning errors in fine-grained steps. Moreover, ICL examples retrieved at the question level may omit critical steps or even mislead the model with irrelevant details. To address this issue, we propose BoostStep, a method that enhances reasoning accuracy through step-aligned ICL, a novel mechanism that carefully aligns retrieved reference steps with the corresponding reasoning steps. Additionally, BoostStep incorporates an effective "first-try" strategy to deliver exemplars highly relevant to the current state of reasoning. BoostStep is a flexible and powerful method that integrates seamlessly with chain-of-thought (CoT) and tree search algorithms, refining both candidate selection and decision-making. Empirical results show that BoostStep improves GPT-4o's CoT performance by 4.6% across mathematical benchmarks, significantly surpassing traditional few-shot learning's 1.2%. Moreover, it can achieve an additional 7.5\% gain combined with tree search. Surprisingly, it enhances state-of-the-art LLMs to solve challenging math problems using simpler examples. It improves DeepSeek-R1-671B's performance on AIME by 2.2%, leveraging simple examples only from the MATH dataset. 
</p> </div> </dd> <dt> <a name='item399'>[399]</a> <a href ="/abs/2501.03884" title="Abstract" id="2501.03884"> arXiv:2501.03884 </a> (replaced) [<a href="/pdf/2501.03884" title="Download PDF" id="pdf-2501.03884" aria-labelledby="pdf-2501.03884">pdf</a>, <a href="/format/2501.03884" title="Other formats" id="oth-2501.03884" aria-labelledby="oth-2501.03884">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AlphaPO - Reward shape matters for LLM alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+A">Aman Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+S">Shao Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+Q">Qingquan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Sirou Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+J">Jiwoo Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saha,+A">Ankan Saha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+V">Viral Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+N">Noah Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+E">Eunki Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+S">Siyu Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agarwal,+P">Parag Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pillai,+N">Natesh Pillai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Keerthi,+S+S">S. 
Sathiya Keerthi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Reinforcement Learning with Human Feedback (RLHF) and its variants have made huge strides toward the effective alignment of large language models (LLMs) to follow instructions and reflect human values. More recently, Direct Alignment Algorithms (DAAs) have emerged in which the reward modeling stage of RLHF is skipped by characterizing the reward directly as a function of the policy being learned. Some popular examples of DAAs include Direct Preference Optimization (DPO) and Simple Preference Optimization (SimPO). These methods often suffer from likelihood displacement, a phenomenon by which the probabilities of preferred responses are often reduced undesirably. <br>In this paper, we argue that, for DAAs the reward (function) shape matters. We introduce \textbf{AlphaPO}, a new DAA method that leverages an $\alpha$-parameter to help change the shape of the reward function beyond the standard log reward. AlphaPO helps maintain fine-grained control over likelihood displacement and over-optimization. Compared to SimPO, one of the best performing DAAs, AlphaPO leads to about 7\% to 10\% relative improvement in alignment performance for the instruct versions of Mistral-7B and Llama3-8B while achieving 15\% to 50\% relative improvement over DPO on the same models. The analysis and results presented highlight the importance of the reward shape, and how one can systematically change it to affect training dynamics, as well as improve alignment performance. 
</p> </div> </dd> <dt> <a name='item400'>[400]</a> <a href ="/abs/2501.04945" title="Abstract" id="2501.04945"> arXiv:2501.04945 </a> (replaced) [<a href="/pdf/2501.04945" title="Download PDF" id="pdf-2501.04945" aria-labelledby="pdf-2501.04945">pdf</a>, <a href="https://arxiv.org/html/2501.04945v3" title="View HTML" id="html-2501.04945" aria-labelledby="html-2501.04945" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.04945" title="Other formats" id="oth-2501.04945" aria-labelledby="oth-2501.04945">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-by-Step Mastery: Enhancing Soft Constraint Following Ability of Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+Q">Qingyu Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+J">Jie Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Q">Qianyu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+J">Jiaqing Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yanghua Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+W">Weikang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zeye Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F">Fei Yu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> It is crucial for large language models (LLMs) to follow instructions that involve multiple constraints. However, it is an unexplored area to enhance LLMs' ability to follow soft constraints. To bridge the gap, we initially design a pipeline to construct datasets with high-quality outputs automatically. 
Additionally, to fully utilize the positive and negative samples generated during the data construction process, we choose Direct Preference Optimization (DPO) as the training method. Furthermore, taking into account the difficulty of soft constraints indicated by the number of constraints, we design a curriculum learning training paradigm based on the constraint quantity. We experimentally evaluate the effectiveness of our methods in improving LLMs' soft constraint following ability and analyze the factors driving the <a href="http://improvements.The" rel="external noopener nofollow" class="link-external link-http">this http URL</a> datasets and code are publicly available at <a href="https://github.com/Rainier-rq/FollowSoftConstraint" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item401'>[401]</a> <a href ="/abs/2501.07824" title="Abstract" id="2501.07824"> arXiv:2501.07824 </a> (replaced) [<a href="/pdf/2501.07824" title="Download PDF" id="pdf-2501.07824" aria-labelledby="pdf-2501.07824">pdf</a>, <a href="https://arxiv.org/html/2501.07824v2" title="View HTML" id="html-2501.07824" aria-labelledby="html-2501.07824" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.07824" title="Other formats" id="oth-2501.07824" aria-labelledby="oth-2501.07824">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Real-time Verification and Refinement of Language Model Text Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ko,+J">Joonho Ko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Baek,+J">Jinheon Baek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+S+J">Sung Ju Hwang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial 
Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have shown remarkable performance across a wide range of natural language tasks. However, a critical challenge remains in that they sometimes generate factually incorrect answers. To address this, while many previous work has focused on identifying errors in their generation and further refining them, they are slow in deployment since they are designed to verify the response from LLMs only after their entire generation (from the first to last tokens) is done. Further, we observe that once LLMs generate incorrect tokens early on, there is a higher likelihood that subsequent tokens will also be factually incorrect. To this end, in this work, we propose Streaming-VR (Streaming Verification and Refinement), a novel approach designed to enhance the efficiency of verification and refinement of LLM outputs. Specifically, the proposed Streaming-VR enables on-the-fly verification and correction of tokens as they are being generated, similar to a streaming process, ensuring that each subset of tokens is checked and refined in real-time by another LLM as the LLM constructs its response. Through comprehensive evaluations on multiple datasets, we demonstrate that our approach not only enhances the factual accuracy of LLMs, but also offers a more efficient solution compared to prior refinement methods. 
</p> </div> </dd> <dt> <a name='item402'>[402]</a> <a href ="/abs/2501.09766" title="Abstract" id="2501.09766"> arXiv:2501.09766 </a> (replaced) [<a href="/pdf/2501.09766" title="Download PDF" id="pdf-2501.09766" aria-labelledby="pdf-2501.09766">pdf</a>, <a href="https://arxiv.org/html/2501.09766v2" title="View HTML" id="html-2501.09766" aria-labelledby="html-2501.09766" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.09766" title="Other formats" id="oth-2501.09766" aria-labelledby="oth-2501.09766">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> iTool: Boosting Tool Use of Large Language Models via Iterative Reinforced Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+Y">Yirong Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ding,+X">Xiao Ding</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yuxian Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+W">Weiwen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ning,+W">Wu Ning</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hou,+Y">Yutai Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Ting Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Augmenting large language models (LLMs) with external tools is known as a promising approach to enhancing their capabilities, especially for complex tasks. 
Synthesizing tool-use data through real-world simulations is an effective way to achieve it. Nevertheless, our investigation reveals that (1) training gains significantly decay as synthetic data increases. The model struggles to benefit from more synthetic data due to potential data diversity issues, resulting in poor performance in complex scenarios. Moreover, we find that (2) this challenge primarily manifests as minor discrepancies between the model's output and the ground truth response (termed as deficiency), such as errors in parameter values that require complex reasoning from the context to resolve. To this end, we propose an iterative reinforced fine-tuning strategy designed to alleviate these challenges. This strategy involves: (1) enhancing the diversity of synthetic data through path exploration of Monte Carlo Tree Search. (2) iteratively identifying deficiency-related data, constructing fine-grained preference pairs to pinpoint deficiencies, and then applying preference optimization to optimize these deficiencies. Our experiments show that models trained using our method achieve about 3\% better performance than same-size models, outperforming larger open-source and closed-source models. 
</p> </div> </dd> <dt> <a name='item403'>[403]</a> <a href ="/abs/2501.11790" title="Abstract" id="2501.11790"> arXiv:2501.11790 </a> (replaced) [<a href="/pdf/2501.11790" title="Download PDF" id="pdf-2501.11790" aria-labelledby="pdf-2501.11790">pdf</a>, <a href="https://arxiv.org/html/2501.11790v2" title="View HTML" id="html-2501.11790" aria-labelledby="html-2501.11790" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.11790" title="Other formats" id="oth-2501.11790" aria-labelledby="oth-2501.11790">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Benchmarking Large Language Models via Random Variables </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+Z">Zijin Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Hao Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+S">Su Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+J">Junnan Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yilin Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yujing Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Feiran Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Linyi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongxia Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xiao Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Recent studies have raised 
concerns about the reliability of current mathematical benchmarks, highlighting issues such as simplistic design and potential data contamination. Therefore, creating a reliable benchmark that effectively evaluates the genuine capabilities of large language models (LLMs) in mathematical reasoning remains a significant challenge. To address this, we propose RV-Bench, a framework for Benchmarking LLMs via Random Variables in mathematical reasoning. Specifically, the background content of a random variable question (RV question) mirrors the original problem in existing benchmarks, but the variable combinations are randomized, making it "unseen" by the LLMs. Models must completely understand the question pattern of the original problem to correctly answer RV questions with various variable values. As a result, the LLM's genuine capability in mathematical reasoning is reflected by its accuracy and robustness on RV-Bench. We conducted extensive experiments on over 30 representative LLMs across more than 1000 RV questions. Our findings suggest that LLMs exhibit an imbalance in proficiency between encountered and "unseen" data domains. Proficiency generalization across similar mathematical reasoning tasks is verified to be limited by accuracy and robustness, but it can still be enhanced through test-time scaling. 
</p> </div> </dd> <dt> <a name='item404'>[404]</a> <a href ="/abs/2501.12051" title="Abstract" id="2501.12051"> arXiv:2501.12051 </a> (replaced) [<a href="/pdf/2501.12051" title="Download PDF" id="pdf-2501.12051" aria-labelledby="pdf-2501.12051">pdf</a>, <a href="https://arxiv.org/html/2501.12051v2" title="View HTML" id="html-2501.12051" aria-labelledby="html-2501.12051" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.12051" title="Other formats" id="oth-2501.12051" aria-labelledby="oth-2501.12051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MedS$^3$: Towards Medical Small Language Models with Self-Evolved Slow Thinking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shuyang Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+Y">Yusheng Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhe Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Ya Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yanfeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yu Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 21 pages; </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Medical language models (MLMs) have become pivotal in advancing medical natural language processing. However, prior models that rely on pre-training or supervised fine-tuning often exhibit low data efficiency and limited practicality in real-world clinical applications. 
While OpenAI's o1 highlights test-time scaling in mathematics, attempts to replicate this approach in medicine typically distill responses from GPT-series models to open-source models, focusing primarily on multiple-choice tasks. This strategy, though straightforward, neglects critical concerns like data privacy and realistic deployment in clinical settings. In this work, we present a deployable, small-scale medical reasoning system, MedS3, designed for long-chain reasoning in clinical tasks using a self-evolution paradigm. Starting with a seed dataset of around 8,000 instances spanning five domains and 16 datasets, we prompt a base policy model to perform Monte Carlo Tree Search (MCTS) to construct rule-verifiable reasoning chains. Each reasoning step is assigned an evolution rollout value, allowing verified trajectories to train the policy model and the process reward model (PRM). During inference, the policy model generates multiple responses, and the reward model selects the one with a newly proposed PRM-guided Vote-Sum (P-VS) strategy. Experiments on eleven evaluation datasets demonstrate that MedS3 outperforms not only the prior strongest medical model by 6.59, but also 32B-level general reasoning models by 8.71 points. Code and data are available at <a href="https://github.com/pixas/MedSSS" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item405'>[405]</a> <a href ="/abs/2501.12619" title="Abstract" id="2501.12619"> arXiv:2501.12619 </a> (replaced) [<a href="/pdf/2501.12619" title="Download PDF" id="pdf-2501.12619" aria-labelledby="pdf-2501.12619">pdf</a>, <a href="https://arxiv.org/html/2501.12619v3" title="View HTML" id="html-2501.12619" aria-labelledby="html-2501.12619" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.12619" title="Other formats" id="oth-2501.12619" aria-labelledby="oth-2501.12619">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Quantification of Large Language Model Distillation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Sunbowen Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Junting Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ao,+C">Chang Ao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaige Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+X">Xinrun Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+S">Sirui He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+H">Haihong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+T">Tianci Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiaheng Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alinejad-Rokny,+H">Hamid Alinejad-Rokny</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+M">Min Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Y">Yitao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zhoufutu Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ni,+S">Shiwen Ni</a></div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Model distillation is a fundamental technique in building large language models (LLMs), transferring knowledge from a teacher model to a student model. However, distillation can lead to model homogenization, reducing diversity among models and impairing their ability to robustly handle complex or novel tasks. These limitations underscore the need to systematically quantify the distillation process and its impact. In this work, we propose a framework to evaluate and quantify model distillation. Our method addresses two key aspects: (1) Identifying identity cognition contradictions to assess discrepancies in how models perceive and represent identity-related information, and (2) Analyzing multi-granularity response similarities across models to measure the extent of homogenization. Experimental results demonstrate two key insights: (1) Well-known closed-source and open-source LLMs usually exhibit high distillation degrees, except for Claude, Doubao, and Gemini. (2) Base LLMs show higher distillation degrees compared to aligned LLMs. By offering a systematic approach to improve the transparency of LLM data distillation, we call for LLMs with more independent development and more transparent technical reports to improve LLMs' robustness and safety. The code and data are available under <a href="https://github.com/Aegis1863/LLMs-Distillation-Quantification" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item406'>[406]</a> <a href ="/abs/2501.13115" title="Abstract" id="2501.13115"> arXiv:2501.13115 </a> (replaced) [<a href="/pdf/2501.13115" title="Download PDF" id="pdf-2501.13115" aria-labelledby="pdf-2501.13115">pdf</a>, <a href="https://arxiv.org/html/2501.13115v2" title="View HTML" id="html-2501.13115" aria-labelledby="html-2501.13115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.13115" title="Other formats" id="oth-2501.13115" aria-labelledby="oth-2501.13115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dagger Behind Smile: Fool LLMs with a Happy Ending Story </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xurui Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Z">Zhixin Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huai,+S">Shuo Huai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kong,+J">Jiayi Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jun Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Cryptography and Security (cs.CR) </div> <p class='mathjax'> The wide adoption of Large Language Models (LLMs) has attracted significant attention from $\textit{jailbreak}$ attacks, where adversarial prompts crafted through optimization or manual design exploit LLMs to generate malicious contents. However, optimization-based attacks have limited efficiency and transferability, while existing manual designs are either easily detectable or demand intricate interactions with LLMs. In this paper, we first point out a novel perspective for jailbreak attacks: LLMs are more responsive to $\textit{positive}$ prompts. 
Based on this, we deploy Happy Ending Attack (HEA) to wrap up a malicious request in a scenario template involving a positive prompt formed mainly via a $\textit{happy ending}$; it thus fools LLMs into jailbreaking either immediately or at a follow-up malicious request. This has made HEA both efficient and effective, as it requires only up to two turns to fully jailbreak LLMs.
href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+R">Rongxiang Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 13 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large language models (LLMs) generally utilize a consistent data distribution throughout the pretraining process. However, as the model's capability improves, it is intuitive that its data preferences dynamically change, indicating the need for pretraining with different data at various training stages. To achieve it, we propose the Perplexity Difference (PD) based Preference Curriculum learning (PDPC) framework, which always perceives and uses the data preferred by LLMs to train and boost them. First, we introduce the PD metric to quantify the difference in how challenging a sample is for weak versus strong models. Samples with high PD are more challenging for weak models to learn and are more suitable to be arranged in the later stage of pretraining. Second, we propose the preference function to approximate and predict the data preference of the LLM at any training step, so as to complete the arrangement of the dataset offline and ensure continuous training without interruption. Experimental results on 1.3B and 3B models demonstrate that PDPC significantly surpasses baselines. Notably, the 3B model trained on 1T tokens achieves an increased average accuracy of over 8.1% across MMLU and CMMLU. 
</p> </div> </dd> <dt> <a name='item408'>[408]</a> <a href ="/abs/2501.13669" title="Abstract" id="2501.13669"> arXiv:2501.13669 </a> (replaced) [<a href="/pdf/2501.13669" title="Download PDF" id="pdf-2501.13669" aria-labelledby="pdf-2501.13669">pdf</a>, <a href="https://arxiv.org/html/2501.13669v2" title="View HTML" id="html-2501.13669" aria-labelledby="html-2501.13669" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.13669" title="Other formats" id="oth-2501.13669" aria-labelledby="oth-2501.13669">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How to Alleviate Catastrophic Forgetting in LLMs Finetuning? Hierarchical Layer-Wise and Element-Wise Regularization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+S">Shezheng Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+H">Hao Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+J">Jun Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shasha Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+L">Long Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+Q">Qian Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaodong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+J">Jie Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) exhibit strong general language capabilities. 
However, fine-tuning these models on domain-specific tasks often leads to catastrophic forgetting, where the model overwrites or loses essential knowledge acquired during pretraining. This phenomenon significantly limits the broader applicability of LLMs. To address this challenge, we propose a novel approach to compute the element-wise importance of model parameters crucial for preserving general knowledge during fine-tuning. Our method utilizes a dual-objective optimization strategy: (1) regularization loss based on element-wise parameter importance, which constrains the updates to parameters crucial for general knowledge; (2) cross-entropy loss to adapt to domain-specific tasks. Additionally, we introduce layer-wise coefficients to account for the varying contributions of different layers, dynamically balancing the dual-objective optimization. Extensive experiments on scientific, medical, and physical tasks using GPT-J and LLaMA-3 demonstrate that our approach mitigates catastrophic forgetting while enhancing model adaptability. Compared to previous methods, our solution is approximately 20 times faster and requires only 10-15% of the storage, highlighting the practical efficiency. The code will be released. 
</p> </div> </dd> <dt> <a name='item409'>[409]</a> <a href ="/abs/2501.15175" title="Abstract" id="2501.15175"> arXiv:2501.15175 </a> (replaced) [<a href="/pdf/2501.15175" title="Download PDF" id="pdf-2501.15175" aria-labelledby="pdf-2501.15175">pdf</a>, <a href="https://arxiv.org/html/2501.15175v2" title="View HTML" id="html-2501.15175" aria-labelledby="html-2501.15175" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.15175" title="Other formats" id="oth-2501.15175" aria-labelledby="oth-2501.15175">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Option-ID Based Elimination For Multiple Choice Questions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Z">Zhenhao Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+B">Bulou Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+Q">Qingyao Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yiqun Liu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Multiple choice questions (MCQs) are a popular and important task for evaluating large language models (LLMs). Based on common strategies people use when answering MCQs, the process of elimination (PoE) has been proposed as an effective problem-solving method. Existing methods to the PoE generally fall into two categories: one involves having the LLM directly select the incorrect options, while the other involves scoring the options. However, both methods incur high computational costs and often perform worse than methods that directly answer the MCQs with the option IDs. To address this issue, this paper proposes a PoE based on option ID. 
Specifically, our method eliminates an option by selecting
Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hsu,+Y">Yi-Li Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rohatgi,+S">Shaurya Rohatgi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chieh-Yang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ng,+H+Y+S">Ho Yin Sam Ng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rossi,+R">Ryan Rossi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Sungchul Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+T">Tong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ku,+L">Lun-Wei Ku</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Giles,+C+L">C. Lee Giles</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+T+K">Ting-Hao K. Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to TACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Since the SCICAP datasets launch in 2021, the research community has made significant progress in generating captions for scientific figures in scholarly articles. In 2023, the first SCICAP Challenge took place, inviting global teams to use an expanded SCICAP dataset to develop models for captioning diverse figure types across various academic fields. At the same time, text generation models advanced quickly, with many powerful pre-trained large multimodal models (LMMs) emerging that showed impressive capabilities in various vision-and-language tasks. This paper presents an overview of the first SCICAP Challenge and details the performance of various models on its data, capturing a snapshot of the fields state. 
We found that professional editors overwhelmingly preferred figure captions generated by GPT-4V over those from all other models and even the original captions written by authors. Following this key finding, we conducted detailed analyses to answer this question: Have advanced LMMs solved the task of generating captions for scientific figures? </p> </div> </dd> <dt> <a name='item411'>[411]</a> <a href ="/abs/2502.00997" title="Abstract" id="2502.00997"> arXiv:2502.00997 </a> (replaced) [<a href="/pdf/2502.00997" title="Download PDF" id="pdf-2502.00997" aria-labelledby="pdf-2502.00997">pdf</a>, <a href="https://arxiv.org/html/2502.00997v3" title="View HTML" id="html-2502.00997" aria-labelledby="html-2502.00997" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00997" title="Other formats" id="oth-2502.00997" aria-labelledby="oth-2502.00997">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MergeME: Model Merging Techniques for Homogeneous and Heterogeneous MoEs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yuhang Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Karamanolakis,+G">Giannis Karamanolakis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soto,+V">Victor Soto</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rumshisky,+A">Anna Rumshisky</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kulkarni,+M">Mayank Kulkarni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+F">Furong Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ai,+W">Wei Ai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Jianhua Lu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 Main </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The recent success of specialized Large Language Models (LLMs) in domains such as mathematical reasoning and coding has led to growing interest in methods for merging these expert LLMs into a unified Mixture-of-Experts (MoE) model, with the goal of enhancing performance in each domain while retaining effectiveness on general tasks. However, the effective merging of expert models remains an open challenge, especially for models with highly divergent weight parameters or different architectures. State-of-the-art MoE merging methods only work with homogeneous model architectures and rely on simple unweighted averaging to merge expert layers, which does not address parameter interference and requires extensive fine-tuning of the merged MoE to restore performance. To address these limitations, this paper introduces new MoE merging techniques, including strategies to mitigate parameter interference, routing heuristics to reduce the need for MoE fine-tuning, and a novel method for merging experts with different architectures. Extensive experiments across multiple domains demonstrate the effectiveness of our proposed methods, reducing fine-tuning costs, improving performance over state-of-the-art methods, and expanding the applicability of MoE merging. 
</p> </div> </dd> <dt> <a name='item412'>[412]</a> <a href ="/abs/2502.01220" title="Abstract" id="2502.01220"> arXiv:2502.01220 </a> (replaced) [<a href="/pdf/2502.01220" title="Download PDF" id="pdf-2502.01220" aria-labelledby="pdf-2502.01220">pdf</a>, <a href="https://arxiv.org/html/2502.01220v2" title="View HTML" id="html-2502.01220" aria-labelledby="html-2502.01220" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.01220" title="Other formats" id="oth-2502.01220" aria-labelledby="oth-2502.01220">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Models Struggle to Achieve a Consistent Temporal Representation of Facts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Khodja,+H+A">Hichem Ammar Khodja</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=B%C3%A9chet,+F">Fr茅d茅ric B茅chet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Brabant,+Q">Quentin Brabant</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nasr,+A">Alexis Nasr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lecorv%C3%A9,+G">Gw茅nol茅 Lecorv茅</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> preprint v2 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Language Models (LMs) have shown substantial improvements in handling factual knowledge, yet their capability to consistently represent temporal facts, which are valid only within specific timeframes, remains underexplored. To investigate this, we introduce TimeStress, a novel dataset comprising 521K statements on 2003 of the most popular temporal facts in Wikidata. 
Each statement contextualizes a fact with correct and incorrect dates across three precisions (Day, Month, Year). This setup allows us to evaluate LMs' ability to discern between correct and incorrect temporal statements based on their probability of being generated. We assess 18 LMs across various architectures using two metrics: the win rate, indicating how often correct dates outperform incorrect ones, and robustness, reflecting consistent performance across all dates. Our findings reveal that while some LMs achieve a win rate exceeding 80\%, robustness remains low, with the best model achieving only 6\%. Furthermore, robust knowledge at one date precision does not reliably transfer to others, highlighting a significant generalization gap. These results underscore the struggle of LMs to maintain a consistent temporal representation, supporting their limitations as reliable sources of temporal knowledge. We provide all data and code for further research. </p> </div> </dd> <dt> <a name='item413'>[413]</a> <a href ="/abs/2502.02028" title="Abstract" id="2502.02028"> arXiv:2502.02028 </a> (replaced) [<a href="/pdf/2502.02028" title="Download PDF" id="pdf-2502.02028" aria-labelledby="pdf-2502.02028">pdf</a>, <a href="https://arxiv.org/html/2502.02028v2" title="View HTML" id="html-2502.02028" aria-labelledby="html-2502.02028" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.02028" title="Other formats" id="oth-2502.02028" aria-labelledby="oth-2502.02028">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fine-tuning Language Models for Recipe Generation: A Comparative Analysis and Benchmark Study </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Vij,+A">Anneketh Vij</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Changhao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nair,+R+A">Rahul Anil 
Nair</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ho,+T+E">Theodore Eugene Ho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+E">Edward Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhowmick,+A">Ayan Bhowmick</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 10 figures,14 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This research presents an exploration and study of the recipe generation task by fine-tuning various very small language models, with a focus on developing robust evaluation metrics and comparing across different language models the open-ended task of recipe generation. This study presents extensive experiments with multiple model architectures, ranging from T5-small (Raffel et al., 2023) and SmolLM-135M(Allal et al., 2024) to Phi-2 (Research, 2023), implementing both traditional NLP metrics and custom domain-specific evaluation metrics. Our novel evaluation framework incorporates recipe-specific metrics for assessing content quality and introduces approaches to allergen substitution. The results indicate that, while larger models generally perform better on standard metrics, the relationship between model size and recipe quality is more nuanced when considering domain-specific metrics. SmolLM-360M and SmolLM-1.7B demonstrate comparable performance despite their size difference before and after fine-tuning, while fine-tuning Phi-2 shows notable limitations in recipe generation despite its larger parameter count. The comprehensive evaluation framework and allergen substitution systems provide valuable insights for future work in recipe generation and broader NLG tasks that require domain expertise and safety considerations. 
</p> </div> </dd> <dt> <a name='item414'>[414]</a> <a href ="/abs/2502.03418" title="Abstract" id="2502.03418"> arXiv:2502.03418 </a> (replaced) [<a href="/pdf/2502.03418" title="Download PDF" id="pdf-2502.03418" aria-labelledby="pdf-2502.03418">pdf</a>, <a href="https://arxiv.org/html/2502.03418v2" title="View HTML" id="html-2502.03418" aria-labelledby="html-2502.03418" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.03418" title="Other formats" id="oth-2502.03418" aria-labelledby="oth-2502.03418">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Think or Step-by-Step? UnZIPping the Black Box in Zero-Shot Prompts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sadr,+N+G">Nikta Gohari Sadr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Madhusudan,+S">Sangmitra Madhusudan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Emami,+A">Ali Emami</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages (excluding references) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Zero-shot prompting techniques have significantly improved the performance of Large Language Models (LLMs). However, we lack a clear understanding of why zero-shot prompts are so effective. For example, in the prompt "Let's think step-by-step," is "think" or "step-by-step" more crucial to its success? Existing interpretability methods, such as gradient-based and attention-based approaches, are computationally intensive and restricted to open-source models. We introduce the ZIP score (Zero-shot Importance of Perturbation score), a versatile metric applicable to both open and closed-source models, based on systematic input word perturbations. 
Our experiments across four recent LLMs, seven widely-used prompts, and several tasks, reveal interesting patterns in word importance. For instance, while both 'step-by-step' and 'think' show high ZIP scores, which one is more influential depends on the model and task. We validate our method using controlled experiments and compare our results with human judgments, finding that proprietary models align more closely with human intuition regarding word significance. These findings enhance our understanding of LLM behavior and contribute to developing more effective zero-shot prompts and improved model analysis. </p> </div> </dd> <dt> <a name='item415'>[415]</a> <a href ="/abs/2502.04511" title="Abstract" id="2502.04511"> arXiv:2502.04511 </a> (replaced) [<a href="/pdf/2502.04511" title="Download PDF" id="pdf-2502.04511" aria-labelledby="pdf-2502.04511">pdf</a>, <a href="/format/2502.04511" title="Other formats" id="oth-2502.04511" aria-labelledby="oth-2502.04511">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Beyond Sample-Level Feedback: Using Reference-Level Feedback to Guide Data Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mehri,+S">Shuhaib Mehri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+H">Heng Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-T眉r</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> LLMs demonstrate remarkable capabilities in following natural language instructions, largely due to instruction-tuning on high-quality datasets. 
While synthetic data generation has emerged as a scalable approach for creating such datasets, maintaining consistent quality standards remains challenging. Recent approaches incorporate feedback to improve data quality, but typically operate at the sample level, generating and applying feedback for each response individually. In this work, we propose Reference-Level Feedback, a novel methodology that instead collects feedback based on high-quality reference samples from carefully curated seed data. We use this feedback to capture rich signals of desirable characteristics and propagate it throughout the data synthesis process. We present REFED, a dataset of 10K instruction-response pairs synthesized using such feedback. We demonstrate the effectiveness of our approach by showing that Llama-3.1-8B-Instruct finetuned on REFED achieves state-of-the-art performance among similar-sized SFT-based models on AlpacaEval 2.0 and strong results on Arena-Hard. Through extensive experiments, we show that our approach consistently outperforms traditional sample-level feedback methods with significantly fewer feedback collections and improves performance across different model architectures. 
</p> </div> </dd> <dt> <a name='item416'>[416]</a> <a href ="/abs/2502.04795" title="Abstract" id="2502.04795"> arXiv:2502.04795 </a> (replaced) [<a href="/pdf/2502.04795" title="Download PDF" id="pdf-2502.04795" aria-labelledby="pdf-2502.04795">pdf</a>, <a href="https://arxiv.org/html/2502.04795v2" title="View HTML" id="html-2502.04795" aria-labelledby="html-2502.04795" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.04795" title="Other formats" id="oth-2502.04795" aria-labelledby="oth-2502.04795">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Developmentally-plausible Working Memory Shapes a Critical Period for Language Acquisition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mita,+M">Masato Mita</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yoshida,+R">Ryo Yoshida</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oseki,+Y">Yohei Oseki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models possess general linguistic abilities but acquire language less efficiently than humans. This study proposes a method for integrating the developmental characteristics of working memory during the critical period, a stage when human language acquisition is particularly efficient, into the training process of language models. The proposed method introduces a mechanism that initially constrains working memory during the early stages of training and gradually relaxes this constraint in an exponential manner as learning progresses. 
Targeted syntactic evaluation shows that the proposed method outperforms conventional methods without memory constraints or with static memory constraints. These findings not only provide new directions for designing data-efficient language models but also offer indirect evidence supporting the role of the developmental characteristics of working memory as the underlying mechanism of the critical period in language acquisition. </p> </div> </dd> <dt> <a name='item417'>[417]</a> <a href ="/abs/2502.05551" title="Abstract" id="2502.05551"> arXiv:2502.05551 </a> (replaced) [<a href="/pdf/2502.05551" title="Download PDF" id="pdf-2502.05551" aria-labelledby="pdf-2502.05551">pdf</a>, <a href="https://arxiv.org/html/2502.05551v2" title="View HTML" id="html-2502.05551" aria-labelledby="html-2502.05551" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05551" title="Other formats" id="oth-2502.05551" aria-labelledby="oth-2502.05551">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FRAMES: Boosting LLMs with A Four-Quadrant Multi-Stage Pretraining Strategy </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuemiao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+F">Feiyu Duan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+L">Liangyu Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yongwei Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Sirui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Weng,+R">Rongxiang Weng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jingang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation 
and Language (cs.CL)</span> </div> <p class='mathjax'> Large language models (LLMs) have significantly advanced human language understanding and generation, with pretraining data quality and organization being crucial to their performance. Multi-stage pretraining is a promising approach, but existing methods often lack quantitative criteria for data partitioning and instead rely on intuitive heuristics. In this paper, we propose the novel Four-quadRAnt Multi-stage prEtraining strategy (FRAME), guided by the established principle of organizing the pretraining process into four stages to achieve significant loss reductions four times. This principle is grounded in two key findings: first, training on high Perplexity (PPL) data followed by low PPL data, and second, training on low PPL difference (PD) data followed by high PD data, both causing the loss to drop significantly twice and performance enhancements. By partitioning data into four quadrants and strategically organizing them, FRAME achieves a remarkable 16.8% average improvement over random across MMLU and CMMLU for the 3B model, effectively boosting LLM performance. 
</p> </div> </dd> <dt> <a name='item418'>[418]</a> <a href ="/abs/2502.05670" title="Abstract" id="2502.05670"> arXiv:2502.05670 </a> (replaced) [<a href="/pdf/2502.05670" title="Download PDF" id="pdf-2502.05670" aria-labelledby="pdf-2502.05670">pdf</a>, <a href="https://arxiv.org/html/2502.05670v3" title="View HTML" id="html-2502.05670" aria-labelledby="html-2502.05670" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05670" title="Other formats" id="oth-2502.05670" aria-labelledby="oth-2502.05670">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Language Models Largely Exhibit Human-like Constituent Ordering Preferences </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tur,+A+D">Ada Defne Tur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kamath,+G">Gaurav Kamath</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Reddy,+S">Siva Reddy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main Conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Though English sentences are typically inflexible vis-à-vis word order, constituents often show far more variability in ordering. One prominent theory presents the notion that constituent ordering is directly correlated with constituent weight: a measure of the constituent's length or complexity. Such theories are interesting in the context of natural language processing (NLP), because while recent advances in NLP have led to significant gains in the performance of large language models (LLMs), much remains unclear about how these models process language, and how this compares to human language processing. 
In particular, the question remains whether LLMs display the same patterns with constituent movement, and may provide insights into existing theories on when and how the shift occurs in human language. We compare a variety of LLMs with diverse properties to evaluate broad LLM performance on four types of constituent movement: heavy NP shift, particle movement, dative alternation, and multiple PPs. Despite performing unexpectedly around particle movement, LLMs generally align with human preferences around constituent ordering. </p> </div> </dd> <dt> <a name='item419'>[419]</a> <a href ="/abs/2502.05933" title="Abstract" id="2502.05933"> arXiv:2502.05933 </a> (replaced) [<a href="/pdf/2502.05933" title="Download PDF" id="pdf-2502.05933" aria-labelledby="pdf-2502.05933">pdf</a>, <a href="https://arxiv.org/html/2502.05933v2" title="View HTML" id="html-2502.05933" aria-labelledby="html-2502.05933" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05933" title="Other formats" id="oth-2502.05933" aria-labelledby="oth-2502.05933">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning to Substitute Words with Model-based Score Ranking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Hongye Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Henao,+R">Ricardo Henao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at NAACL 2025 (main, long) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Smart word substitution aims to enhance sentence quality by improving word choices; however current benchmarks rely on human-labeled data. 
Since word choices are inherently subjective, ground-truth word substitutions generated by a small group of annotators are often incomplete and likely not generalizable. To circumvent this issue, we instead employ a model-based score (BARTScore) to quantify sentence quality, thus forgoing the need for human annotations. Specifically, we use this score to define a distribution for each word substitution, allowing one to test whether a substitution is statistically superior relative to others. In addition, we propose a loss function that directly optimizes the alignment between model predictions and sentence scores, while also enhancing the overall quality score of a substitution. Crucially, model learning no longer requires human labels, thus avoiding the cost of annotation while maintaining the quality of the text modified with substitutions. Experimental results show that the proposed approach outperforms both masked language models (BERT, BART) and large language models (GPT-4, LLaMA). The source code is available at <a href="https://github.com/Hyfred/Substitute-Words-with-Ranking" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item420'>[420]</a> <a href ="/abs/2502.06207" title="Abstract" id="2502.06207"> arXiv:2502.06207 </a> (replaced) [<a href="/pdf/2502.06207" title="Download PDF" id="pdf-2502.06207" aria-labelledby="pdf-2502.06207">pdf</a>, <a href="https://arxiv.org/html/2502.06207v2" title="View HTML" id="html-2502.06207" aria-labelledby="html-2502.06207" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.06207" title="Other formats" id="oth-2502.06207" aria-labelledby="oth-2502.06207">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unveiling the Capabilities of Large Language Models in Detecting Offensive Language with Annotation Disagreement </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Junyu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+K">Kai Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Kaichun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+K">Kelaiti Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+R+K">Roy Ka-Wei Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+B">Bo Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Liang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+H">Hongfei Lin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 17 pages, submitted to the ACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) have become essential for offensive language detection, yet their ability to handle annotation disagreement remains underexplored. 
Disagreement samples, which arise from subjective interpretations, pose a unique challenge due to their ambiguous nature. Understanding how LLMs process these cases, particularly their confidence levels, can offer insight into their alignment with human annotators. This study systematically evaluates the performance of multiple LLMs in detecting offensive language at varying levels of annotation agreement. We analyze binary classification accuracy, examine the relationship between model confidence and human disagreement, and explore how disagreement samples influence model decision-making during few-shot learning and instruction fine-tuning. Our findings reveal that LLMs struggle with low-agreement samples, often exhibiting overconfidence in these ambiguous cases. However, utilizing disagreement samples in training improves both detection accuracy and model alignment with human judgment. These insights provide a foundation for enhancing LLM-based offensive language detection in real-world moderation tasks. </p> </div> </dd> <dt> <a name='item421'>[421]</a> <a href ="/abs/2502.06600" title="Abstract" id="2502.06600"> arXiv:2502.06600 </a> (replaced) [<a href="/pdf/2502.06600" title="Download PDF" id="pdf-2502.06600" aria-labelledby="pdf-2502.06600">pdf</a>, <a href="https://arxiv.org/html/2502.06600v2" title="View HTML" id="html-2502.06600" aria-labelledby="html-2502.06600" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.06600" title="Other formats" id="oth-2502.06600" aria-labelledby="oth-2502.06600">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluation of Multilingual Image Captioning: How far can we get with CLIP models? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gomes,+G">Gonçalo Gomes</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zerva,+C">Chrysoula Zerva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Martins,+B">Bruno Martins</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted in Findings of NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The evaluation of image captions, looking at both linguistic fluency and semantic correspondence to visual contents, has witnessed a significant effort. Still, despite advancements such as the CLIPScore metric, multilingual captioning evaluation has remained relatively unexplored. This work presents several strategies, and extensive experiments, related to evaluating CLIPScore variants in multilingual settings. To address the lack of multilingual test data, we consider two different strategies: (1) using quality aware machine-translated datasets with human judgements, and (2) re-purposing multilingual datasets that target semantic inference and reasoning. Our results highlight the potential of finetuned multilingual models to generalize across languages and to handle complex linguistic challenges. Tests with machine-translated data show that multilingual CLIPScore models can maintain a high correlation with human judgements across different languages, and additional tests with natively multilingual and multicultural data further attest to the high-quality assessments. 
</p> </div> </dd> <dt> <a name='item422'>[422]</a> <a href ="/abs/2502.06851" title="Abstract" id="2502.06851"> arXiv:2502.06851 </a> (replaced) [<a href="/pdf/2502.06851" title="Download PDF" id="pdf-2502.06851" aria-labelledby="pdf-2502.06851">pdf</a>, <a href="https://arxiv.org/html/2502.06851v2" title="View HTML" id="html-2502.06851" aria-labelledby="html-2502.06851" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.06851" title="Other formats" id="oth-2502.06851" aria-labelledby="oth-2502.06851">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Survey on Vision-Language-Action Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Adilkhanov,+A">Adilzhan Adilkhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yelenov,+A">Amir Yelenov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seitzhanov,+A">Assylkhan Seitzhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mazhitov,+A">Ayan Mazhitov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdikarimov,+A">Azamat Abdikarimov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sandykbayeva,+D">Danissa Sandykbayeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kenzhebek,+D">Daryn Kenzhebek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mukashev,+D">Dinmukhammed Mukashev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Umurbekov,+I">Ilyas Umurbekov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chumakov,+J">Jabrail Chumakov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Spanova,+K">Kamila Spanova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Burunchina,+K">Karina Burunchina</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yergibay,+M">Madina Yergibay</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Issa,+M">Margulan Issa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zabirova,+M">Moldir Zabirova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhuzbay,+N">Nurdaulet Zhuzbay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kabdyshev,+N">Nurlan Kabdyshev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhaniyar,+N">Nurlan Zhaniyar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yermagambet,+R">Rasul Yermagambet</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chibar,+R">Rustam Chibar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seitzhan,+S">Saltanat Seitzhan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khajikhanov,+S">Soibkhon Khajikhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taunyazov,+T">Tasbolat Taunyazov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galimzhanov,+T">Temirlan Galimzhanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaiyrbay,+T">Temirlan Kaiyrbay</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mussin,+T">Tleukhan Mussin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Syrymova,+T">Togzhan Syrymova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kostyukova,+V">Valeriya Kostyukova</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Massalim,+Y">Yerkebulan Massalim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kassym,+Y">Yermakhan Kassym</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nurbayeva,+Z">Zerde Nurbayeva</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kappassov,+Z">Zhanat Kappassov</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial 
Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> This paper presents an AI-generated review of Vision-Language-Action (VLA) models, summarizing key methodologies, findings, and future directions. The content is produced using large language models (LLMs) and is intended only for demonstration purposes. This work does not represent original research, but highlights how AI can help automate literature reviews. As AI-generated content becomes more prevalent, ensuring accuracy, reliability, and proper synthesis remains a challenge. Future research will focus on developing a structured framework for AI-assisted literature reviews, exploring techniques to enhance citation accuracy, source credibility, and contextual understanding. By examining the potential and limitations of LLM in academic writing, this study aims to contribute to the broader discussion of integrating AI into research workflows. This work serves as a preliminary step toward establishing systematic approaches for leveraging AI in literature review generation, making academic knowledge synthesis more efficient and scalable. 
</p> </div> </dd> <dt> <a name='item423'>[423]</a> <a href ="/abs/2502.06855" title="Abstract" id="2502.06855"> arXiv:2502.06855 </a> (replaced) [<a href="/pdf/2502.06855" title="Download PDF" id="pdf-2502.06855" aria-labelledby="pdf-2502.06855">pdf</a>, <a href="/format/2502.06855" title="Other formats" id="oth-2502.06855" aria-labelledby="oth-2502.06855">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Self-Supervised Prompt Optimization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Xiang,+J">Jinyu Xiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiayi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Z">Zhaoyang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Teng,+F">Fengwei Teng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+J">Jinhao Tu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+X">Xinbing Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+S">Sirui Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yuyu Luo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> Well-designed prompts are crucial for enhancing Large language models' (LLMs) reasoning capabilities while aligning their outputs with task requirements across diverse domains. However, manually designed prompts require expertise and iterative experimentation. 
While existing prompt optimization methods aim to automate this process, they rely heavily on external references such as ground truth or by humans, limiting their applicability in real-world scenarios where such data is unavailable or costly to obtain. To address this, we propose Self-Supervised Prompt Optimization (SPO), a cost-efficient framework that discovers effective prompts for both closed and open-ended tasks without requiring external reference. Motivated by the observations that prompt quality manifests directly in LLM outputs and LLMs can effectively assess adherence to task requirements, we derive evaluation and optimization signals purely from output comparisons. Specifically, SPO selects superior prompts through pairwise output comparisons evaluated by an LLM evaluator, followed by an LLM optimizer that aligns outputs with task requirements. Extensive experiments demonstrate that SPO outperforms state-of-the-art prompt optimization methods, achieving comparable or superior results with significantly lower costs (e.g., 1.1% to 5.6% of existing methods) and fewer samples (e.g., three samples). 
The code is available at <a href="https://github.com/geekan/MetaGPT/blob/main/examples/spo" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item424'>[424]</a> <a href ="/abs/2502.07316" title="Abstract" id="2502.07316"> arXiv:2502.07316 </a> (replaced) [<a href="/pdf/2502.07316" title="Download PDF" id="pdf-2502.07316" aria-labelledby="pdf-2502.07316">pdf</a>, <a href="https://arxiv.org/html/2502.07316v3" title="View HTML" id="html-2502.07316" aria-labelledby="html-2502.07316" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07316" title="Other formats" id="oth-2502.07316" aria-labelledby="oth-2502.07316">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CodeI/O: Condensing Reasoning Patterns via Code Input-Output Prediction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Junlong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+D">Daya Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+D">Dejian Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+R">Runxin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Junxian He</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Reasoning is a fundamental capability of Large Language Models. While prior research predominantly focuses on enhancing narrow skills like math or code generation, improving performance on many other reasoning tasks remains challenging due to sparse and fragmented training data. 
To address this issue, we propose CodeI/O, a novel approach that systematically condenses diverse reasoning patterns inherently embedded in contextually-grounded codes, through transforming the original code into a code input-output prediction format. By training models to predict inputs/outputs given code and test cases entirely in natural language as Chain-of-Thought (CoT) rationales, we expose them to universal reasoning primitives -- like logic flow planning, state-space searching, decision tree traversal, and modular decomposition -- while decoupling structured reasoning from code-specific syntax and preserving procedural rigor. Experimental results demonstrate CodeI/O leads to consistent improvements across symbolic, scientific, logic, math & numerical, and commonsense reasoning tasks. By matching the existing ground-truth outputs or re-executing the code with predicted inputs, we can verify each prediction and further enhance the CoTs through multi-turn revision, resulting in CodeI/O++ and achieving higher performance. Our data and models are available at <a href="https://github.com/hkust-nlp/CodeIO" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item425'>[425]</a> <a href ="/abs/2502.07322" title="Abstract" id="2502.07322"> arXiv:2502.07322 </a> (replaced) [<a href="/pdf/2502.07322" title="Download PDF" id="pdf-2502.07322" aria-labelledby="pdf-2502.07322">pdf</a>, <a href="https://arxiv.org/html/2502.07322v2" title="View HTML" id="html-2502.07322" aria-labelledby="html-2502.07322" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07322" title="Other formats" id="oth-2502.07322" aria-labelledby="oth-2502.07322">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MEMIT-Merge: Addressing MEMIT's Key-Value Conflicts in Same-Subject Batch Editing for LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Z">Zilu Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+X">Xiangqing Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+R">Rui Xia</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> As large language models continue to scale up, knowledge editing techniques that modify models' internal knowledge without full retraining have gained significant attention. MEMIT, a prominent batch editing algorithm, stands out for its capability to perform mass knowledge modifications. However, we uncover a critical limitation that MEMIT's editing efficacy significantly deteriorates when processing batches containing multiple edits sharing the same subject. 
Our analysis reveals that the root cause lies in MEMIT's key value modeling framework: When multiple facts with the same subject in a batch are modeled through MEMIT's key value mechanism, identical keys (derived from the shared subject) are forced to represent different values (corresponding to different knowledge), resulting in updates conflicts during editing. Addressing this issue, we propose MEMIT-Merge, an enhanced approach that merges value computation processes for facts sharing the same subject, effectively resolving the performance degradation in same-subject batch editing scenarios. Experimental results demonstrate that when MEMIT's edit success rate drops to around 50% at larger batch sizes, MEMIT-Merge maintains a success rate exceeding 90%, showcasing remarkable robustness to subject entity collisions. </p> </div> </dd> <dt> <a name='item426'>[426]</a> <a href ="/abs/2502.07340" title="Abstract" id="2502.07340"> arXiv:2502.07340 </a> (replaced) [<a href="/pdf/2502.07340" title="Download PDF" id="pdf-2502.07340" aria-labelledby="pdf-2502.07340">pdf</a>, <a href="https://arxiv.org/html/2502.07340v2" title="View HTML" id="html-2502.07340" aria-labelledby="html-2502.07340" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07340" title="Other formats" id="oth-2502.07340" aria-labelledby="oth-2502.07340">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Aligning Large Language Models to Follow Instructions and Hallucinate Less via Effective Data Filtering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Si,+S">Shuzheng Si</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haozhe Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+G">Gang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+C">Cheng Gao</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Bai,+Y">Yuzhuo Bai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhitong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+K">Kaikai An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+K">Kangyang Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qian,+C">Chen Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+F">Fanchao Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+B">Baobao Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Training LLMs on data containing unfamiliar knowledge during the instruction tuning stage can encourage hallucinations. To address this challenge, we introduce NOVA, a novel framework designed to identify high-quality data that aligns well with the LLM's learned knowledge to reduce hallucinations. NOVA includes Internal Consistency Probing (ICP) and Semantic Equivalence Identification (SEI) to measure how familiar the LLM is with instruction data. Specifically, ICP evaluates the LLM's understanding of the given instruction by calculating the tailored consistency among multiple self-generated responses. SEI further assesses the familiarity of the LLM with the target response by comparing it to the generated responses, using the proposed semantic clustering and well-designed voting strategy. Finally, to ensure the quality of selected samples, we introduce an expert-aligned reward model, considering characteristics beyond just familiarity. 
By considering data quality and avoiding unfamiliar data, we can utilize the selected data to effectively align LLMs to follow instructions and hallucinate less. </p> </div> </dd> <dt> <a name='item427'>[427]</a> <a href ="/abs/2502.07424" title="Abstract" id="2502.07424"> arXiv:2502.07424 </a> (replaced) [<a href="/pdf/2502.07424" title="Download PDF" id="pdf-2502.07424" aria-labelledby="pdf-2502.07424">pdf</a>, <a href="https://arxiv.org/html/2502.07424v2" title="View HTML" id="html-2502.07424" aria-labelledby="html-2502.07424" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.07424" title="Other formats" id="oth-2502.07424" aria-labelledby="oth-2502.07424">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RomanLens: The Role Of Latent Romanization In Multilinguality In LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Saji,+A">Alan Saji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Husain,+J+A">Jaavid Aktar Husain</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jayakumar,+T">Thanmay Jayakumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dabre,+R">Raj Dabre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kunchukuttan,+A">Anoop Kunchukuttan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Puduppully,+R">Ratish Puduppully</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages, 19 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Large Language Models (LLMs) exhibit remarkable multilingual generalization despite being predominantly trained on English-centric corpora. 
A fundamental question arises: how do LLMs achieve such robust multilingual capabilities? Taking the case of non-Roman script languages, we investigate the role of Romanization - the representation of non-Roman scripts using Roman characters - as a bridge in multilingual processing. Using mechanistic interpretability techniques, we analyze next-token generation and find that intermediate layers frequently represent target words in Romanized form before transitioning to native script, a phenomenon we term Latent Romanization. Further, through activation patching experiments, we demonstrate that LLMs encode semantic concepts similarly across native and Romanized scripts, suggesting a shared underlying representation. Additionally, for translation into non-Roman script languages, our findings reveal that when the target language is in Romanized form, its representations emerge earlier in the model's layers compared to native script. These insights contribute to a deeper understanding of multilingual representation in LLMs and highlight the implicit role of Romanization in facilitating language transfer. 
</p> </div> </dd> <dt> <a name='item428'>[428]</a> <a href ="/abs/2502.08045" title="Abstract" id="2502.08045"> arXiv:2502.08045 </a> (replaced) [<a href="/pdf/2502.08045" title="Download PDF" id="pdf-2502.08045" aria-labelledby="pdf-2502.08045">pdf</a>, <a href="https://arxiv.org/html/2502.08045v2" title="View HTML" id="html-2502.08045" aria-labelledby="html-2502.08045" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08045" title="Other formats" id="oth-2502.08045" aria-labelledby="oth-2502.08045">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Break the Checkbox: Challenging Closed-Style Evaluations of Cultural Alignment in LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kabir,+M">Mohsinul Kabir</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abrar,+A">Ajwad Abrar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ananiadou,+S">Sophia Ananiadou</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY) </div> <p class='mathjax'> A large number of studies rely on closed-style multiple-choice surveys to evaluate cultural alignment in Large Language Models (LLMs). In this work, we challenge this constrained evaluation paradigm and explore more realistic, unconstrained approaches. Using the World Values Survey (WVS) and Hofstede Cultural Dimensions as case studies, we demonstrate that LLMs exhibit stronger cultural alignment in less constrained settings, where responses are not forced. Additionally, we show that even minor changes, such as reordering survey choices, lead to inconsistent outputs, exposing the limitations of closed-style evaluations. 
Our findings advocate for more robust and flexible evaluation frameworks that focus on specific cultural proxies, encouraging more nuanced and accurate assessments of cultural alignment in LLMs. </p> </div> </dd> <dt> <a name='item429'>[429]</a> <a href ="/abs/2502.08168" title="Abstract" id="2502.08168"> arXiv:2502.08168 </a> (replaced) [<a href="/pdf/2502.08168" title="Download PDF" id="pdf-2502.08168" aria-labelledby="pdf-2502.08168">pdf</a>, <a href="https://arxiv.org/html/2502.08168v3" title="View HTML" id="html-2502.08168" aria-labelledby="html-2502.08168" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08168" title="Other formats" id="oth-2502.08168" aria-labelledby="oth-2502.08168">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SARChat-Bench-2M: A Multi-Task Vision-Language Benchmark for SAR Image Interpretation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Z">Zhiming Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+X">Xiayang Xiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+S">Sihao Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+P">Peidong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">HaiPeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+Q">Qingyun Pan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> As a powerful all-weather Earth observation tool, synthetic aperture radar (SAR) remote sensing enables critical military reconnaissance, maritime surveillance, and infrastructure monitoring. 
Although Vision language models (VLMs) have made remarkable progress in natural language processing and image understanding, their applications remain limited in professional domains due to insufficient domain expertise. This paper innovatively proposes the first large-scale multimodal dialogue dataset for SAR images, named SARChat-2M, which contains approximately 2 million high-quality image-text pairs and encompasses diverse scenarios with detailed target annotations. This dataset not only supports several key tasks such as visual understanding and object detection tasks, but also has unique innovative aspects: this study develops a visual-language dataset and benchmark for the SAR domain, enabling and evaluating VLMs' capabilities in SAR image interpretation, which provides a paradigmatic framework for constructing multimodal datasets across various remote sensing vertical domains. Through experiments on 16 mainstream VLMs, the effectiveness of the dataset has been fully verified. The project will be released at <a href="https://github.com/JimmyMa99/SARChat" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item430'>[430]</a> <a href ="/abs/2502.08279" title="Abstract" id="2502.08279"> arXiv:2502.08279 </a> (replaced) [<a href="/pdf/2502.08279" title="Download PDF" id="pdf-2502.08279" aria-labelledby="pdf-2502.08279">pdf</a>, <a href="https://arxiv.org/html/2502.08279v2" title="View HTML" id="html-2502.08279" aria-labelledby="html-2502.08279" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08279" title="Other formats" id="oth-2502.08279" aria-labelledby="oth-2502.08279">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> What Is That Talk About? 
A Video-to-Text Summarization Dataset for Scientific Presentations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+D">Dongqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Whitehouse,+C">Chenxi Whitehouse</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+X">Xi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mahon,+L">Louis Mahon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saxena,+R">Rohit Saxena</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+Z">Zheng Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+Y">Yifu Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lapata,+M">Mirella Lapata</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Demberg,+V">Vera Demberg</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Transforming recorded videos into concise and accurate textual summaries is a growing challenge in multimodal learning. This paper introduces VISTA, a dataset specifically designed for video-to-text summarization in scientific domains. VISTA contains 18,599 recorded AI conference presentations paired with their corresponding paper abstracts. We benchmark the performance of state-of-the-art large models and apply a plan-based framework to better capture the structured nature of abstracts. Both human and automated evaluations confirm that explicit planning enhances summary quality and factual consistency. However, a considerable gap remains between models and human performance, highlighting the challenges of scientific video summarization. 
</p> </div> </dd> <dt> <a name='item431'>[431]</a> <a href ="/abs/2502.08356" title="Abstract" id="2502.08356"> arXiv:2502.08356 </a> (replaced) [<a href="/pdf/2502.08356" title="Download PDF" id="pdf-2502.08356" aria-labelledby="pdf-2502.08356">pdf</a>, <a href="/format/2502.08356" title="Other formats" id="oth-2502.08356" aria-labelledby="oth-2502.08356">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Systematic Knowledge Injection into Large Language Models via Diverse Augmentation for Domain-Specific RAG </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhushan,+K">Kushagra Bhushan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nandwani,+Y">Yatin Nandwani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Khandelwal,+D">Dinesh Khandelwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+S">Sonam Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pandey,+G">Gaurav Pandey</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raghu,+D">Dinesh Raghu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Joshi,+S">Sachindra Joshi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 14 tables, to be published in NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Retrieval-Augmented Generation (RAG) has emerged as a prominent method for incorporating domain knowledge into Large Language Models (LLMs). While RAG enhances response relevance by incorporating retrieved domain knowledge in the context, retrieval errors can still lead to hallucinations and incorrect answers. 
To recover from retriever failures, domain knowledge is injected by fine-tuning the model to generate the correct response, even in the case of retrieval errors. However, we observe that without systematic knowledge augmentation, fine-tuned LLMs may memorize new information but still fail to extract relevant domain knowledge, leading to poor performance. In this work, we present a novel framework that significantly enhances the fine-tuning process by augmenting the training data in two ways -- context augmentation and knowledge paraphrasing. In context augmentation, we create multiple training samples for a given QA pair by varying the relevance of the retrieved information, teaching the model when to ignore and when to rely on retrieved content. In knowledge paraphrasing, we fine-tune with multiple answers to the same question, enabling LLMs to better internalize specialized knowledge. To mitigate catastrophic forgetting due to fine-tuning, we add a domain-specific identifier to a question and also utilize a replay buffer containing general QA pairs. Experimental results demonstrate the efficacy of our method over existing techniques, achieving up to 10\% relative gain in token-level recall while preserving the LLM's generalization capabilities. 
</p> </div> </dd> <dt> <a name='item432'>[432]</a> <a href ="/abs/2502.08561" title="Abstract" id="2502.08561"> arXiv:2502.08561 </a> (replaced) [<a href="/pdf/2502.08561" title="Download PDF" id="pdf-2502.08561" aria-labelledby="pdf-2502.08561">pdf</a>, <a href="https://arxiv.org/html/2502.08561v2" title="View HTML" id="html-2502.08561" aria-labelledby="html-2502.08561" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08561" title="Other formats" id="oth-2502.08561" aria-labelledby="oth-2502.08561">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Quality-Aware Decoding: Unifying Quality Estimation and Decoding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koneru,+S">Sai Koneru</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huck,+M">Matthias Huck</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Exel,+M">Miriam Exel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niehues,+J">Jan Niehues</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under Review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Quality Estimation (QE) models for Neural Machine Translation (NMT) predict the quality of the hypothesis without having access to the reference. An emerging research direction in NMT involves the use of QE models, which have demonstrated high correlations with human judgment and can enhance translations through Quality-Aware Decoding. Although several approaches have been proposed based on sampling multiple candidate translations and picking the best candidate, none have integrated these models directly into the decoding process. 
In this paper, we address this by proposing a novel token-level QE model capable of reliably scoring partial translations. We build a uni-directional QE model for this, as decoder models are inherently trained and efficient on partial sequences. We then present a decoding strategy that integrates the QE model for Quality-Aware decoding and demonstrate that the translation quality improves when compared to the N-best list re-ranking with state-of-the-art QE models (up to $1.39$ XCOMET-XXL $\uparrow$). Finally, we show that our approach provides significant benefits in document translation tasks, where the quality of N-best lists is typically suboptimal. Code can be found at <a href="https://ai4lt.iar.kit.edu/english/projects" rel="external noopener nofollow" class="link-external link-https">this https URL</a>\<a href="http://_kontextmt.php" rel="external noopener nofollow" class="link-external link-http">this http URL</a> </p> </div> </dd> <dt> <a name='item433'>[433]</a> <a href ="/abs/2502.08661" title="Abstract" id="2502.08661"> arXiv:2502.08661 </a> (replaced) [<a href="/pdf/2502.08661" title="Download PDF" id="pdf-2502.08661" aria-labelledby="pdf-2502.08661">pdf</a>, <a href="https://arxiv.org/html/2502.08661v2" title="View HTML" id="html-2502.08661" aria-labelledby="html-2502.08661" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08661" title="Other formats" id="oth-2502.08661" aria-labelledby="oth-2502.08661">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Few-shot LLM Synthetic Data with Distribution Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ren,+J">Jiyuan Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+Z">Zhaocheng Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+Z">Zhihao Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+Q">Qinglin 
Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+S">Sunhao Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chuhan Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+Z">Zhenhua Dong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 5 figures, accepted at www 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> As large language models (LLMs) advance, their ability to perform in-context learning and few-shot language generation has improved significantly. This has spurred using LLMs to produce high-quality synthetic data to enhance the performance of smaller models like online retrievers or weak LLMs. However, LLM-generated synthetic data often differs from the real data in key language attributes (e.g., styles, tones, content proportions, etc.). As a result, mixing these synthetic data directly with real data may distort the original data distribution, potentially hindering performance improvements. To solve this, we introduce SynAlign: a synthetic data generation and filtering framework based on key attribute distribution matching. Before generation, SynAlign employs an uncertainty tracker surrogated by the Gaussian Process model to iteratively select data clusters distinct from selected ones as demonstrations for new data synthesis, facilitating efficient exploration of the diversity of the real data. Then, a latent attribute reasoning method is employed: the LLM summarizes linguistic attributes of demonstrations and then synthesizes new data based on them. 
This approach facilitates synthesizing diverse data with linguistic attributes that appear in real <a href="http://data.After" rel="external noopener nofollow" class="link-external link-http">this http URL</a> generation, the Maximum Mean Discrepancy is used as the objective function to learn the sampling weight of each synthetic data, ensuring distribution matching with the real data. Our experiments on multiple text prediction tasks show significant performance improvements. We also conducted an online A/B test on an online retriever to demonstrate SynAlign's effectiveness. </p> </div> </dd> <dt> <a name='item434'>[434]</a> <a href ="/abs/2502.09056" title="Abstract" id="2502.09056"> arXiv:2502.09056 </a> (replaced) [<a href="/pdf/2502.09056" title="Download PDF" id="pdf-2502.09056" aria-labelledby="pdf-2502.09056">pdf</a>, <a href="https://arxiv.org/html/2502.09056v2" title="View HTML" id="html-2502.09056" aria-labelledby="html-2502.09056" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09056" title="Other formats" id="oth-2502.09056" aria-labelledby="oth-2502.09056">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adapting Language-Specific LLMs to a Reasoning Model in One Day via Model Merging - An Open Recipe </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Pipatanakul,+K">Kunat Pipatanakul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taveekitworachai,+P">Pittawat Taveekitworachai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Manakul,+P">Potsawee Manakul</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tharnpipitchai,+K">Kasima Tharnpipitchai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language 
(cs.CL)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This paper investigates data selection and model merging methodologies aimed at incorporating advanced reasoning capabilities such as those of DeepSeek R1 into language-specific large language models (LLMs), with a particular focus on the Thai LLM. Our goal is to enhance the reasoning capabilities of language-specific LLMs while maintaining their target language abilities. DeepSeek R1 excels in reasoning but primarily benefits high-resource languages such as English and Chinese. However, low-resource languages remain underserved due to the dominance of English-centric training data and model optimizations, which limit performance in these languages. This limitation results in unreliable code-switching and diminished effectiveness on tasks in low-resource languages. Meanwhile, local and regional LLM initiatives have attempted to bridge this gap by developing language-specific LLMs that focus on improving local linguistic fidelity. We demonstrate that, with only publicly available datasets and a computational budget of $120, it is possible to enhance the reasoning capabilities of language-specific LLMs to match the level of DeepSeek R1, without compromising their performance on target language tasks. 
</p> </div> </dd> <dt> <a name='item435'>[435]</a> <a href ="/abs/2502.09120" title="Abstract" id="2502.09120"> arXiv:2502.09120 </a> (replaced) [<a href="/pdf/2502.09120" title="Download PDF" id="pdf-2502.09120" aria-labelledby="pdf-2502.09120">pdf</a>, <a href="https://arxiv.org/html/2502.09120v2" title="View HTML" id="html-2502.09120" aria-labelledby="html-2502.09120" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09120" title="Other formats" id="oth-2502.09120" aria-labelledby="oth-2502.09120">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The influence of visual and linguistic cues on ignorance inference in Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+Y">Ye-eun Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Maeng,+Y">Yunho Maeng</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 3 figures, 3 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> This study explored how Vision-Language Models (VLMs) process ignorance implicatures with visual and linguistic cues. Particularly, we focused on the effects of contexts (precise and approximate contexts) and modifier types (bare numerals, superlative, and comparative modifiers), which were considered pragmatic and semantic factors respectively. Methodologically, we conducted a truth-value judgment task in visually grounded settings using GPT-4o and Gemini 1.5 Pro. The results indicate that while both models exhibited sensitivity to linguistic cues (modifier), they failed to process ignorance implicatures with visual cues (context) as humans do. 
Specifically, the influence of context was weaker and inconsistent across models, indicating challenges in pragmatic reasoning for VLMs. On the other hand, superlative modifiers were more strongly associated with ignorance implicatures as compared to comparative modifiers, supporting the semantic view. These findings highlight the need for further advancements in VLMs to process language-vision information in a context-dependent way to achieve human-like pragmatic inference. </p> </div> </dd> <dt> <a name='item436'>[436]</a> <a href ="/abs/2502.09566" title="Abstract" id="2502.09566"> arXiv:2502.09566 </a> (replaced) [<a href="/pdf/2502.09566" title="Download PDF" id="pdf-2502.09566" aria-labelledby="pdf-2502.09566">pdf</a>, <a href="https://arxiv.org/html/2502.09566v2" title="View HTML" id="html-2502.09566" aria-labelledby="html-2502.09566" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09566" title="Other formats" id="oth-2502.09566" aria-labelledby="oth-2502.09566">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Zero-shot generation of synthetic neurosurgical data with large language models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Barr,+A+A">Austin A. 
Barr</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+E">Eddie Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sezgin,+E">Emre Sezgin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 4 figures, 4 tables (updated version, fixed typos and formatting) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Machine Learning (cs.LG) </div> <p class='mathjax'> Clinical data is fundamental to advance neurosurgical research, but access is often constrained by data availability, small sample sizes, privacy regulations, and resource-intensive preprocessing and de-identification procedures. Synthetic data offers a potential solution to challenges associated with accessing and using real-world data (RWD). This study aims to evaluate the capability of zero-shot generation of synthetic neurosurgical data with a large language model (LLM), GPT-4o, by benchmarking with the conditional tabular generative adversarial network (CTGAN). Synthetic datasets were compared to real-world neurosurgical data to assess fidelity (means, proportions, distributions, and bivariate correlations), utility (ML classifier performance on RWD), and privacy (duplication of records from RWD). The GPT-4o-generated datasets matched or exceeded CTGAN performance, despite no fine-tuning or access to RWD for pre-training. Datasets demonstrated high univariate and bivariate fidelity to RWD without directly exposing any real patient records, even at amplified sample size. Training an ML classifier on GPT-4o-generated data and testing on RWD for a binary prediction task showed an F1 score (0.706) with comparable performance to training on the CTGAN data (0.705) for predicting postoperative functional status deterioration. GPT-4o demonstrated a promising ability to generate high-fidelity synthetic neurosurgical data. 
These findings also indicate that data synthesized with GPT-4o can effectively augment clinical data with small sample sizes, and train ML models for prediction of neurosurgical outcomes. Further investigation is necessary to improve the preservation of distributional characteristics and boost classifier performance. </p> </div> </dd> <dt> <a name='item437'>[437]</a> <a href ="/abs/2502.09589" title="Abstract" id="2502.09589"> arXiv:2502.09589 </a> (replaced) [<a href="/pdf/2502.09589" title="Download PDF" id="pdf-2502.09589" aria-labelledby="pdf-2502.09589">pdf</a>, <a href="https://arxiv.org/html/2502.09589v2" title="View HTML" id="html-2502.09589" aria-labelledby="html-2502.09589" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09589" title="Other formats" id="oth-2502.09589" aria-labelledby="oth-2502.09589">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Logical forms complement probability in understanding language model (and human) performance </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yixuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+F">Freda Shi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Logic in Computer Science (cs.LO) </div> <p class='mathjax'> With the increasing interest in using large language models (LLMs) for planning in natural language, understanding their behaviors becomes an important research question. This work conducts a systematic investigation of LLMs' ability to perform logical reasoning in natural language. 
We introduce a controlled dataset of hypothetical and disjunctive syllogisms in propositional and modal logic and use it as the testbed for understanding LLM performance. Our results lead to novel insights in predicting LLM behaviors: in addition to the probability of input (Gonen et al., 2023; McCoy et al., 2024), logical forms should be considered as important factors. In addition, we show similarities and discrepancies between the logical reasoning performances of humans and LLMs by collecting and comparing behavioral data from both. </p> </div> </dd> <dt> <a name='item438'>[438]</a> <a href ="/abs/2502.09606" title="Abstract" id="2502.09606"> arXiv:2502.09606 </a> (replaced) [<a href="/pdf/2502.09606" title="Download PDF" id="pdf-2502.09606" aria-labelledby="pdf-2502.09606">pdf</a>, <a href="https://arxiv.org/html/2502.09606v2" title="View HTML" id="html-2502.09606" aria-labelledby="html-2502.09606" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09606" title="Other formats" id="oth-2502.09606" aria-labelledby="oth-2502.09606">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Human-LLM Coevolution: Evidence from Academic Writing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Geng,+M">Mingmeng Geng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Trotta,+R">Roberto Trotta</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computers and Society (cs.CY); Digital Libraries (cs.DL); Machine Learning (cs.LG) </div> <p class='mathjax'> With a statistical analysis of arXiv paper abstracts, we report a marked drop in the frequency of several words previously identified as overused by ChatGPT, such as "delve", starting soon after they were pointed out in early 2024. 
The frequency of certain other words favored by ChatGPT, such as "significant", has instead kept increasing. These phenomena suggest that some authors of academic papers have adapted their use of large language models (LLMs), for example, by selecting outputs or applying modifications to the LLM-generated content. Such coevolution and cooperation of humans and LLMs thus introduce additional challenges to the detection of machine-generated text in real-world scenarios. Estimating the impact of LLMs on academic writing by examining word frequency remains feasible, and more attention should be paid to words that were already frequently employed, including those that have decreased in frequency due to LLMs' disfavor. </p> </div> </dd> <dt> <a name='item439'>[439]</a> <a href ="/abs/2502.10051" title="Abstract" id="2502.10051"> arXiv:2502.10051 </a> (replaced) [<a href="/pdf/2502.10051" title="Download PDF" id="pdf-2502.10051" aria-labelledby="pdf-2502.10051">pdf</a>, <a href="https://arxiv.org/html/2502.10051v2" title="View HTML" id="html-2502.10051" aria-labelledby="html-2502.10051" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10051" title="Other formats" id="oth-2502.10051" aria-labelledby="oth-2502.10051">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ORI: O Routing Intelligence </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shadid,+A">Ahmad Shadid</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+R">Rahul Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mayank,+M">Mohit Mayank</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span> </div> <p class='mathjax'> Single large language models 
(LLMs) often fall short when faced with the ever-growing range of tasks, making a single-model approach insufficient. We address this challenge by proposing ORI (O Routing Intelligence), a dynamic framework that leverages a set of LLMs. By intelligently routing incoming queries to the most suitable model, ORI not only improves task-specific accuracy, but also maintains efficiency. Comprehensive evaluations across diverse benchmarks demonstrate consistent accuracy gains while controlling computational overhead. By intelligently routing queries, ORI outperforms the strongest individual models by up to 2.7 points on MMLU and 1.8 points on MuSR, ties the top performance on ARC, and on BBH. These results underscore the benefits of a multi-model strategy and demonstrate how ORI's adaptive architecture can more effectively handle diverse tasks, offering a scalable, high-performance solution for a system of multiple large language models. </p> </div> </dd> <dt> <a name='item440'>[440]</a> <a href ="/abs/2310.14483" title="Abstract" id="2310.14483"> arXiv:2310.14483 </a> (replaced) [<a href="/pdf/2310.14483" title="Download PDF" id="pdf-2310.14483" aria-labelledby="pdf-2310.14483">pdf</a>, <a href="https://arxiv.org/html/2310.14483v4" title="View HTML" id="html-2310.14483" aria-labelledby="html-2310.14483" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2310.14483" title="Other formats" id="oth-2310.14483" aria-labelledby="oth-2310.14483">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Chain-of-Factors Paper-Reviewer Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Y">Yanzhen Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+S">SeongKu Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xiusi 
Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+B">Bowen Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+J">Jiawei Han</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages; Accepted to WWW 2025 (Code: <a href="https://github.com/yuzhimanhua/CoF" rel="external noopener nofollow" class="link-external link-https">this https URL</a>) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL); Digital Libraries (cs.DL); Machine Learning (cs.LG) </div> <p class='mathjax'> With the rapid increase in paper submissions to academic conferences, the need for automated and accurate paper-reviewer matching is more critical than ever. Previous efforts in this area have considered various factors to assess the relevance of a reviewer's expertise to a paper, such as the semantic similarity, shared topics, and citation connections between the paper and the reviewer's previous works. However, most of these studies focus on only one factor, resulting in an incomplete evaluation of the paper-reviewer relevance. To address this issue, we propose a unified model for paper-reviewer matching that jointly considers semantic, topic, and citation factors. To be specific, during training, we instruction-tune a contextualized language model shared across all factors to capture their commonalities and characteristics; during inference, we chain the three factors to enable step-by-step, coarse-to-fine search for qualified reviewers given a submission. 
Experiments on four datasets (one of which is newly contributed by us) spanning various fields such as machine learning, computer vision, information retrieval, and data mining consistently demonstrate the effectiveness of our proposed Chain-of-Factors model in comparison with state-of-the-art paper-reviewer matching methods and scientific pre-trained language models. </p> </div> </dd> <dt> <a name='item441'>[441]</a> <a href ="/abs/2403.01643" title="Abstract" id="2403.01643"> arXiv:2403.01643 </a> (replaced) [<a href="/pdf/2403.01643" title="Download PDF" id="pdf-2403.01643" aria-labelledby="pdf-2403.01643">pdf</a>, <a href="https://arxiv.org/html/2403.01643v3" title="View HTML" id="html-2403.01643" aria-labelledby="html-2403.01643" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.01643" title="Other formats" id="oth-2403.01643" aria-labelledby="oth-2403.01643">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cost-Effective Attention Mechanisms for Low Resource Settings: Necessity & Sufficiency of Linear Transformations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hosseini,+P">Peyman Hosseini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hosseini,+M">Mehran Hosseini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Castro,+I">Ignacio Castro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Purver,+M">Matthew Purver</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> From natural language processing to vision, Scaled Dot Product Attention (SDPA) is the backbone of most modern deep learning applications. 
Unfortunately, its memory and computational requirements can be prohibitive in low-resource settings. In this paper, we improve its efficiency without sacrificing its versatility. We propose three attention variants where we remove consecutive linear transformations or add a novel one, and evaluate them on a range of standard NLP and vision tasks. Our proposed models are substantially lighter than standard SDPA (and have 25-50% fewer parameters). We show that the performance cost of these changes is negligible relative to size reduction and that in one case (Super Attention) we succeed in outperforming SDPA by up to 10% while improving its speed and reducing its parameters by 25%. </p> </div> </dd> <dt> <a name='item442'>[442]</a> <a href ="/abs/2405.13144" title="Abstract" id="2405.13144"> arXiv:2405.13144 </a> (replaced) [<a href="/pdf/2405.13144" title="Download PDF" id="pdf-2405.13144" aria-labelledby="pdf-2405.13144">pdf</a>, <a href="https://arxiv.org/html/2405.13144v3" title="View HTML" id="html-2405.13144" aria-labelledby="html-2405.13144" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.13144" title="Other formats" id="oth-2405.13144" aria-labelledby="oth-2405.13144">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs for Mathematical Modeling: Towards Bridging the Gap between Natural and Mathematical Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xuhan Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+Q">Qingning Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Y">Yan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+A">Anningzhe Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Findings of 
NAACL2025. Project: <a href="https://github.com/FreedomIntelligence/Mamo" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated strong performance across various natural language processing tasks, yet their proficiency in mathematical reasoning remains a key challenge. Addressing the gap between natural and mathematical language requires advanced reasoning capabilities, approaching those of Artificial General Intelligence (AGI). However, the evaluation remains challenging, as perfectly representing reality is inherently elusive, and traditional methods like manual or direct comparison of mathematical statements (Ramamonjison et al., 2023) are insufficient for assessing true modeling ability. We propose a process-oriented framework to evaluate LLMs' ability to construct mathematical models, using solvers to compare outputs with ground truth. Introducing Mamo, a benchmark with 1,209 questions covering ordinary differential equations, linear programming, and mixed-integer linear programming, we enable automatic evaluation of modeling accuracy. The results show that existing LLMs struggle with complex mathematical modeling tasks, with larger models demonstrating superior performance, while open-source models remain competitive in simpler cases but still fall short of proprietary models in more challenging problems. 
</p> </div> </dd> <dt> <a name='item443'>[443]</a> <a href ="/abs/2406.04116" title="Abstract" id="2406.04116"> arXiv:2406.04116 </a> (replaced) [<a href="/pdf/2406.04116" title="Download PDF" id="pdf-2406.04116" aria-labelledby="pdf-2406.04116">pdf</a>, <a href="/format/2406.04116" title="Other formats" id="oth-2406.04116" aria-labelledby="oth-2406.04116">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Promoting the Responsible Development of Speech Datasets for Mental Health and Neurological Disorders Research </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mancini,+E">Eleonora Mancini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tanevska,+A">Ana Tanevska</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galassi,+A">Andrea Galassi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galatolo,+A">Alessio Galatolo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ruggeri,+F">Federico Ruggeri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Torroni,+P">Paolo Torroni</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 36 pages </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Journal of Artificial Intelligence Research (JAIR), vol 82 (2025), pp 937-972 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Current research in machine learning and artificial intelligence is largely centered on modeling and performance evaluation, less so on data collection. However, recent research demonstrated that limitations and biases in data may negatively impact trustworthiness and reliability. 
These aspects are particularly impactful on sensitive domains such as mental health and neurological disorders, where speech data are used to develop AI applications for patients and healthcare providers. In this paper, we chart the landscape of available speech datasets for this domain, to highlight possible pitfalls and opportunities for improvement and promote fairness and diversity. We present a comprehensive list of desiderata for building speech datasets for mental health and neurological disorders and distill it into an actionable checklist focused on ethical concerns to foster more responsible research. </p> </div> </dd> <dt> <a name='item444'>[444]</a> <a href ="/abs/2406.11087" title="Abstract" id="2406.11087"> arXiv:2406.11087 </a> (replaced) [<a href="/pdf/2406.11087" title="Download PDF" id="pdf-2406.11087" aria-labelledby="pdf-2406.11087">pdf</a>, <a href="https://arxiv.org/html/2406.11087v4" title="View HTML" id="html-2406.11087" aria-labelledby="html-2406.11087" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11087" title="Other formats" id="oth-2406.11087" aria-labelledby="oth-2406.11087">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DP-MemArc: Differential Privacy Transfer Learning for Memory Efficient Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yanming Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+X">Xinyue Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuwei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ke,+X">Xiaolan Ke</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+S">Songhang Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jiannan Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+C">Chen Ma</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+M">Mengchen Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+S">Sheng Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+J">Jianwei Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+T">Tianyu Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuhong Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Camera Ready version of AAAI 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models have repeatedly shown outstanding performance across diverse applications. However, deploying these models can inadvertently risk user privacy. The significant memory demands during training pose a major challenge in terms of resource consumption. This substantial size places a heavy load on memory resources, raising considerable practical concerns. In this paper, we introduce DP-MemArc, a novel training framework aimed at reducing the memory costs of large language models while emphasizing the protection of user data privacy. DP-MemArc incorporates side network or reversible network designs to support a variety of differential privacy memory-efficient fine-tuning schemes. Our approach not only achieves memory optimization but also ensures robust privacy protection, keeping user data secure and confidential. Extensive experiments have demonstrated that DP-MemArc effectively provides differential privacy-efficient fine-tuning across different task scenarios. 
</p> </div> </dd> <dt> <a name='item445'>[445]</a> <a href ="/abs/2406.11427" title="Abstract" id="2406.11427"> arXiv:2406.11427 </a> (replaced) [<a href="/pdf/2406.11427" title="Download PDF" id="pdf-2406.11427" aria-labelledby="pdf-2406.11427">pdf</a>, <a href="https://arxiv.org/html/2406.11427v2" title="View HTML" id="html-2406.11427" aria-labelledby="html-2406.11427" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.11427" title="Other formats" id="oth-2406.11427" aria-labelledby="oth-2406.11427">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DiTTo-TTS: Diffusion Transformers for Scalable Text-to-Speech without Domain-Specific Factors </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Lee,+K">Keon Lee</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+D+W">Dong Won Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+J">Jaehyeon Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chung,+S">Seungjun Chung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cho,+J">Jaewoong Cho</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD) </div> <p class='mathjax'> Large-scale latent diffusion models (LDMs) excel in content generation across various modalities, but their reliance on phonemes and durations in text-to-speech (TTS) limits scalability and access from other fields. While recent studies show potential in removing these domain-specific factors, performance remains suboptimal. 
In this work, we introduce DiTTo-TTS, a Diffusion Transformer (DiT)-based TTS model, to investigate whether LDM-based TTS can achieve state-of-the-art performance without domain-specific factors. Through rigorous analysis and empirical exploration, we find that (1) DiT with minimal modifications outperforms U-Net, (2) variable-length modeling with a speech length predictor significantly improves results over fixed-length approaches, and (3) conditions like semantic alignment in speech latent representations are key to further enhancement. By scaling our training data to 82K hours and the model size to 790M parameters, we achieve superior or comparable zero-shot performance to state-of-the-art TTS models in naturalness, intelligibility, and speaker similarity, all without relying on domain-specific factors. Speech samples are available at <a href="https://ditto-tts.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item446'>[446]</a> <a href ="/abs/2406.15279" title="Abstract" id="2406.15279"> arXiv:2406.15279 </a> (replaced) [<a href="/pdf/2406.15279" title="Download PDF" id="pdf-2406.15279" aria-labelledby="pdf-2406.15279">pdf</a>, <a href="/format/2406.15279" title="Other formats" id="oth-2406.15279" aria-labelledby="oth-2406.15279">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Safe Inputs but Unsafe Output: Benchmarking Cross-modality Safety Alignment of Large Vision-Language Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Siyin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+X">Xingsong Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+Q">Qinyuan Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+J">Junwen Duan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shimin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fu,+J">Jinlan Fu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+X">Xipeng Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+X">Xuanjing Huang</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> As Artificial General Intelligence (AGI) becomes increasingly integrated into various facets of human life, ensuring the safety and ethical alignment of such systems is paramount. Previous studies primarily focus on single-modality threats, which may not suffice given the integrated and complex nature of cross-modality interactions. We introduce a novel safety alignment challenge called Safe Inputs but Unsafe Output (SIUO) to evaluate cross-modality safety alignment. Specifically, it considers cases where single modalities are safe independently but could potentially lead to unsafe or unethical outputs when combined. To empirically investigate this problem, we developed the SIUO, a cross-modality benchmark encompassing 9 critical safety domains, such as self-harm, illegal activities, and privacy violations. Our findings reveal substantial safety vulnerabilities in both closed- and open-source LVLMs, such as GPT-4V and LLaVA, underscoring the inadequacy of current models to reliably interpret and respond to complex, real-world scenarios. 
</p> </div> </dd> <dt> <a name='item447'>[447]</a> <a href ="/abs/2406.16176" title="Abstract" id="2406.16176"> arXiv:2406.16176 </a> (replaced) [<a href="/pdf/2406.16176" title="Download PDF" id="pdf-2406.16176" aria-labelledby="pdf-2406.16176">pdf</a>, <a href="https://arxiv.org/html/2406.16176v2" title="View HTML" id="html-2406.16176" aria-labelledby="html-2406.16176" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.16176" title="Other formats" id="oth-2406.16176" aria-labelledby="oth-2406.16176">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GraphEval36K: Benchmarking Coding and Reasoning Capabilities of Large Language Models on Graph Datasets </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiming Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zichen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Corcoran,+W">Will Corcoran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Misha">Misha Sra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+A+K">Ambuj K. Singh</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The first two authors contributed equally to this work. This paper has been accepted by NAACL 2025. GraphEval36K is available at <a href="https://grapheval36k.github.io/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have achieved remarkable success in natural language processing (NLP), demonstrating significant capabilities in processing and understanding text data. 
However, recent studies have identified limitations in LLMs' ability to manipulate, program, and reason about structured data, especially graphs. We introduce GraphEval36K, the first comprehensive graph dataset, comprising 40 graph coding problems and 36,900 test cases to evaluate the ability of LLMs on graph problem-solving. Our dataset is categorized into eight primary and four sub-categories to ensure a thorough evaluation across different types of graphs. We benchmark ten LLMs, finding that private models outperform open-source ones, though the gap is narrowing. We also analyze the performance of LLMs across directed vs undirected graphs, different kinds of graph concepts, and network models. Furthermore, to improve the usability of our evaluation framework, we propose Structured Symbolic Decomposition (SSD), an instruction-based method designed to enhance LLM performance on complex graph tasks. Results show that SSD improves the average passing rate of GPT-4, GPT-4o, Gemini-Pro and Claude-3-Sonnet by 8.38%, 6.78%, 29.28% and 25.28%, respectively. 
</p> </div> </dd> <dt> <a name='item448'>[448]</a> <a href ="/abs/2406.16746" title="Abstract" id="2406.16746"> arXiv:2406.16746 </a> (replaced) [<a href="/pdf/2406.16746" title="Download PDF" id="pdf-2406.16746" aria-labelledby="pdf-2406.16746">pdf</a>, <a href="https://arxiv.org/html/2406.16746v4" title="View HTML" id="html-2406.16746" aria-labelledby="html-2406.16746" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2406.16746" title="Other formats" id="oth-2406.16746" aria-labelledby="oth-2406.16746">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Responsible Foundation Model Development Cheatsheet: A Review of Tools & Resources </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Longpre,+S">Shayne Longpre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Biderman,+S">Stella Biderman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Albalak,+A">Alon Albalak</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schoelkopf,+H">Hailey Schoelkopf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McDuff,+D">Daniel McDuff</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kapoor,+S">Sayash Kapoor</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Klyman,+K">Kevin Klyman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lo,+K">Kyle Lo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ilharco,+G">Gabriel Ilharco</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=San,+N">Nay San</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rauh,+M">Maribeth Rauh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Skowron,+A">Aviya Skowron</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vidgen,+B">Bertie Vidgen</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Weidinger,+L">Laura Weidinger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Narayanan,+A">Arvind Narayanan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sanh,+V">Victor Sanh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Adelani,+D">David Adelani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+P">Percy Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bommasani,+R">Rishi Bommasani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Henderson,+P">Peter Henderson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luccioni,+S">Sasha Luccioni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jernite,+Y">Yacine Jernite</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Soldaini,+L">Luca Soldaini</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Foundation model development attracts a rapidly expanding body of contributors, scientists, and applications. To help shape responsible development practices, we introduce the Foundation Model Development Cheatsheet: a growing collection of 250+ tools and resources spanning text, vision, and speech modalities. We draw on a large body of prior work to survey resources (e.g. software, documentation, frameworks, guides, and practical tools) that support informed data selection, processing, and understanding, precise and limitation-aware artifact documentation, efficient model training, advance awareness of the environmental impact from training, careful model evaluation of capabilities, risks, and claims, as well as responsible model release, licensing and deployment practices. 
We hope this curated collection of resources helps guide more responsible development. The process of curating this list, enabled us to review the AI development ecosystem, revealing what tools are critically missing, misused, or over-used in existing practices. We find that (i) tools for data sourcing, model evaluation, and monitoring are critically under-serving ethical and real-world needs, (ii) evaluations for model safety, capabilities, and environmental impact all lack reproducibility and transparency, (iii) text and particularly English-centric analyses continue to dominate over multilingual and multi-modal analyses, and (iv) evaluation of systems, rather than just models, is needed so that capabilities and impact are assessed in context. </p> </div> </dd> <dt> <a name='item449'>[449]</a> <a href ="/abs/2407.00379" title="Abstract" id="2407.00379"> arXiv:2407.00379 </a> (replaced) [<a href="/pdf/2407.00379" title="Download PDF" id="pdf-2407.00379" aria-labelledby="pdf-2407.00379">pdf</a>, <a href="https://arxiv.org/html/2407.00379v2" title="View HTML" id="html-2407.00379" aria-labelledby="html-2407.00379" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.00379" title="Other formats" id="oth-2407.00379" aria-labelledby="oth-2407.00379">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GraphArena: Evaluating and Exploring Large Language Models on Graph Computation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+J">Jianheng Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qifan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yuhan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+N">Nuo Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jia Li</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> ICLR 2025 camera ready version </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The &#8220;arms race&#8221; of Large Language Models (LLMs) demands new benchmarks to examine their progress. In this paper, we introduce GraphArena, a benchmarking tool designed to evaluate LLMs on real-world graph computational problems. It offers a suite of four polynomial-time tasks (e.g., Shortest Distance) and six NP-complete challenges (e.g., Traveling Salesman Problem). GraphArena features a rigorous evaluation framework that classifies LLM outputs as correct, suboptimal (feasible but not optimal), hallucinatory (properly formatted but infeasible), or missing. Evaluation of over 10 LLMs reveals that even top-performing LLMs struggle with larger, more complex graph problems and exhibit hallucination issues. We further explore four potential solutions to address this issue and improve LLMs on graph computation, including chain-of-thought prompting, instruction tuning, code writing, and scaling test-time compute, each demonstrating unique strengths and limitations. GraphArena complements the existing LLM benchmarks and is open-sourced at <a href="https://github.com/squareRoot3/GraphArena" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item450'>[450]</a> <a href ="/abs/2407.16205" title="Abstract" id="2407.16205"> arXiv:2407.16205 </a> (replaced) [<a href="/pdf/2407.16205" title="Download PDF" id="pdf-2407.16205" aria-labelledby="pdf-2407.16205">pdf</a>, <a href="https://arxiv.org/html/2407.16205v4" title="View HTML" id="html-2407.16205" aria-labelledby="html-2407.16205" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.16205" title="Other formats" id="oth-2407.16205" aria-labelledby="oth-2407.16205">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLMs can be Dangerous Reasoners: Analyzing-based Jailbreak Attack on Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+S">Shi Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+H">Hongming Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Rongchang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Xun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+C">Changting Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xing,+W">Wenpeng Xing</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+M">Meng Han</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> The rapid development of Large Language Models (LLMs) has brought significant advancements across various tasks. However, despite these achievements, LLMs still exhibit inherent safety vulnerabilities, especially when confronted with jailbreak attacks. 
Existing jailbreak methods suffer from two main limitations: reliance on complicated prompt engineering and iterative optimization, which lead to low attack success rate (ASR) and attack efficiency (AE). In this work, we propose an efficient jailbreak attack method, Analyzing-based Jailbreak (ABJ), which leverages the advanced reasoning capability of LLMs to autonomously generate harmful content, revealing their underlying safety vulnerabilities during complex reasoning process. We conduct comprehensive experiments on ABJ across various open-source and closed-source LLMs. In particular, ABJ achieves high ASR (82.1% on GPT-4o-2024-11-20) with exceptional AE among all target LLMs, showcasing its remarkable attack effectiveness, transferability, and efficiency. Our findings underscore the urgent need to prioritize and improve the safety of LLMs to mitigate the risks of misuse. </p> </div> </dd> <dt> <a name='item451'>[451]</a> <a href ="/abs/2408.14134" title="Abstract" id="2408.14134"> arXiv:2408.14134 </a> (replaced) [<a href="/pdf/2408.14134" title="Download PDF" id="pdf-2408.14134" aria-labelledby="pdf-2408.14134">pdf</a>, <a href="https://arxiv.org/html/2408.14134v3" title="View HTML" id="html-2408.14134" aria-labelledby="html-2408.14134" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.14134" title="Other formats" id="oth-2408.14134" aria-labelledby="oth-2408.14134">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring the Potential of Large Language Models for Heterophilic Graphs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Y">Yuxia Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shujie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fang,+Y">Yuan Fang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+C">Chuan Shi</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> Accepted by NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Social and Information Networks (cs.SI) </div> <p class='mathjax'> Large language models (LLMs) have presented significant opportunities to enhance various machine learning applications, including graph neural networks (GNNs). By leveraging the vast open-world knowledge within LLMs, we can more effectively interpret and utilize textual data to better characterize heterophilic graphs, where neighboring nodes often have different labels. However, existing approaches for heterophilic graphs overlook the rich textual data associated with nodes, which could unlock deeper insights into their heterophilic contexts. In this work, we explore the potential of LLMs for modeling heterophilic graphs and propose a novel two-stage framework: LLM-enhanced edge discriminator and LLM-guided edge reweighting. In the first stage, we fine-tune the LLM to better identify homophilic and heterophilic edges based on the textual content of their nodes. In the second stage, we adaptively manage message propagation in GNNs for different edge types based on node features, structures, and heterophilic or homophilic characteristics. To cope with the computational demands when deploying LLMs in practical scenarios, we further explore model distillation techniques to fine-tune smaller, more efficient models that maintain competitive performance. Extensive experiments validate the effectiveness of our framework, demonstrating the feasibility of using LLMs to enhance node classification on heterophilic graphs. 
</p> </div> </dd> <dt> <a name='item452'>[452]</a> <a href ="/abs/2409.06635" title="Abstract" id="2409.06635"> arXiv:2409.06635 </a> (replaced) [<a href="/pdf/2409.06635" title="Download PDF" id="pdf-2409.06635" aria-labelledby="pdf-2409.06635">pdf</a>, <a href="https://arxiv.org/html/2409.06635v3" title="View HTML" id="html-2409.06635" aria-labelledby="html-2409.06635" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06635" title="Other formats" id="oth-2409.06635" aria-labelledby="oth-2409.06635">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wenyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+S">Shuo Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+X">Xunlong Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhuohan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Y">Yingxu He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+G">Geyu Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+N+F">Nancy F. 
Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aw,+A+T">Ai Ti Aw</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> The rapid advancements in large language models (LLMs) have significantly enhanced natural language processing capabilities, facilitating the development of AudioLLMs that process and understand speech and audio inputs alongside text. Existing AudioLLMs typically combine a pre-trained audio encoder with a pre-trained LLM, which are subsequently finetuned on specific audio tasks. However, the pre-trained audio encoder has constrained capacity to capture features for new tasks and datasets. To address this, we propose to incorporate mixtures of `weak' encoders (MoWE) into the AudioLLM framework. MoWE supplements a base encoder with a pool of relatively light weight encoders, selectively activated based on the audio input to enhance feature extraction without significantly increasing model size. Our empirical results demonstrate that MoWE effectively improves multi-task performance, broadening the applicability of AudioLLMs to more diverse audio tasks. 
</p> </div> </dd> <dt> <a name='item453'>[453]</a> <a href ="/abs/2409.19058" title="Abstract" id="2409.19058"> arXiv:2409.19058 </a> (replaced) [<a href="/pdf/2409.19058" title="Download PDF" id="pdf-2409.19058" aria-labelledby="pdf-2409.19058">pdf</a>, <a href="https://arxiv.org/html/2409.19058v2" title="View HTML" id="html-2409.19058" aria-labelledby="html-2409.19058" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.19058" title="Other formats" id="oth-2409.19058" aria-labelledby="oth-2409.19058">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLLMate: A Multimodal Benchmark for Weather and Climate Events Forecasting </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haobo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhaowei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiachen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yueya Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lau,+A+K+H">Alexis Kai Hon Lau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+H">Huamin Qu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Atmospheric and Oceanic Physics (physics.ao-ph) </div> <p class='mathjax'> Forecasting weather and climate events is crucial for making appropriate measures to mitigate environmental hazards and minimize losses. However, existing environmental forecasting research focuses narrowly on predicting numerical meteorological variables (e.g., temperature), neglecting the translation of these variables into actionable textual narratives of events and their consequences. 
To bridge this gap, we proposed Weather and Climate Event Forecasting (WCEF), a new task that leverages numerical meteorological raster data and textual event data to predict weather and climate events. This task is challenging to accomplish due to difficulties in aligning multimodal data and the lack of supervised datasets. To address these challenges, we present CLLMate, the first multimodal dataset for WCEF, using 26,156 environmental news articles aligned with ERA5 reanalysis data. We systematically benchmark 23 existing MLLMs on CLLMate, including closed-source, open-source, and our fine-tuned models. Our experiments reveal the advantages and limitations of existing MLLMs and the value of CLLMate for the training and benchmarking of the WCEF task. </p> </div> </dd> <dt> <a name='item454'>[454]</a> <a href ="/abs/2409.19483" title="Abstract" id="2409.19483"> arXiv:2409.19483 </a> (replaced) [<a href="/pdf/2409.19483" title="Download PDF" id="pdf-2409.19483" aria-labelledby="pdf-2409.19483">pdf</a>, <a href="https://arxiv.org/html/2409.19483v4" title="View HTML" id="html-2409.19483" aria-labelledby="html-2409.19483" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.19483" title="Other formats" id="oth-2409.19483" aria-labelledby="oth-2409.19483">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MedCLIP-SAMv2: Towards Universal Text-Driven Medical Image Segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Koleilat,+T">Taha Koleilat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Asgariandehkordi,+H">Hojat Asgariandehkordi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rivaz,+H">Hassan Rivaz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiao,+Y">Yiming Xiao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 10 pages, 2 figures, 6 
tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Segmentation of anatomical structures and pathological regions in medical images is essential for modern clinical diagnosis, disease research, and treatment planning. While significant advancements have been made in deep learning-based segmentation techniques, many of these methods still suffer from limitations in data efficiency, generalizability, and interactivity. As a result, developing precise segmentation methods that require fewer labeled datasets remains a critical challenge in medical image analysis. Recently, the introduction of foundation models like CLIP and Segment-Anything-Model (SAM), with robust cross-domain representations, has paved the way for interactive and universal image segmentation. However, further exploration of these models for data-efficient segmentation in medical imaging is still needed and highly relevant. In this paper, we introduce MedCLIP-SAMv2, a novel framework that integrates the CLIP and SAM models to perform segmentation on clinical scans using text prompts, in both zero-shot and weakly supervised settings. Our approach includes fine-tuning the BiomedCLIP model with a new Decoupled Hard Negative Noise Contrastive Estimation (DHN-NCE) loss, and leveraging the Multi-modal Information Bottleneck (M2IB) to create visual prompts for generating segmentation masks from SAM in the zero-shot setting. We also investigate using zero-shot segmentation labels within a weakly supervised paradigm to enhance segmentation quality further. Extensive testing across four diverse segmentation tasks and medical imaging modalities (breast tumor ultrasound, brain tumor MRI, lung X-ray, and lung CT) demonstrates the high accuracy of our proposed framework. 
Our code is available at <a href="https://github.com/HealthX-Lab/MedCLIP-SAMv2" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item455'>[455]</a> <a href ="/abs/2410.01434" title="Abstract" id="2410.01434"> arXiv:2410.01434 </a> (replaced) [<a href="/pdf/2410.01434" title="Download PDF" id="pdf-2410.01434" aria-labelledby="pdf-2410.01434">pdf</a>, <a href="/format/2410.01434" title="Other formats" id="oth-2410.01434" aria-labelledby="oth-2410.01434">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Circuit Compositions: Exploring Modular Structures in Transformer-Based Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mondorf,+P">Philipp Mondorf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wold,+S">Sondre Wold</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Plank,+B">Barbara Plank</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 22 pages, 21 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> A fundamental question in interpretability research is to what extent neural networks, particularly language models, implement reusable functions through subnetworks that can be composed to perform more complex tasks. Recent advances in mechanistic interpretability have made progress in identifying $\textit{circuits}$, which represent the minimal computational subgraphs responsible for a model's behavior on specific tasks. However, most studies focus on identifying circuits for individual tasks without investigating how functionally similar circuits $\textit{relate}$ to each other. 
To address this gap, we study the modularity of neural networks by analyzing circuits for highly compositional subtasks within a transformer-based language model. Specifically, given a probabilistic context-free grammar, we identify and compare circuits responsible for ten modular string-edit operations. Our results indicate that functionally similar circuits exhibit both notable node overlap and cross-task faithfulness. Moreover, we demonstrate that the circuits identified can be reused and combined through set operations to represent more complex functional model capabilities. </p> </div> </dd> <dt> <a name='item456'>[456]</a> <a href ="/abs/2410.02810" title="Abstract" id="2410.02810"> arXiv:2410.02810 </a> (replaced) [<a href="/pdf/2410.02810" title="Download PDF" id="pdf-2410.02810" aria-labelledby="pdf-2410.02810">pdf</a>, <a href="https://arxiv.org/html/2410.02810v2" title="View HTML" id="html-2410.02810" aria-labelledby="html-2410.02810" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02810" title="Other formats" id="oth-2410.02810" aria-labelledby="oth-2410.02810">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> StateAct: State Tracking and Reasoning for Acting and Planning with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Rozanov,+N">Nikolai Rozanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rei,+M">Marek Rei</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 5 pages appendix, 7 figures, 5 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Planning and acting to solve `real' tasks using large language models (LLMs) in interactive environments has 
become a new frontier for AI methods. While recent advances allowed LLMs to interact with online tools, solve robotics tasks and many more, long range reasoning tasks remain a problem for LLMs. Existing methods to address this issue are very resource intensive and require additional data or human crafted rules, instead, we propose a simple method based on few-shot in-context learning alone to enhance `chain-of-thought' with state-tracking for planning and acting with LLMs. We show that our method establishes the new state-of-the-art on Alfworld for in-context learning methods (+14\% over the previous best few-shot in-context learning method) and performs on par with methods that use additional training data and additional tools such as code-execution. We also demonstrate that our enhanced `chain-of-states' allows the agent to both solve longer horizon problems and to be more efficient in number of steps required to solve a task. We show that our method works across a variety of LLMs for both API-based and open source ones. Finally, we also conduct ablation studies and show that `chain-of-thoughts' helps state-tracking accuracy, while a json-structure harms overall performance. We open-source our code and annotations at <a href="https://github.com/ai-nikolai/StateAct" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item457'>[457]</a> <a href ="/abs/2410.02892" title="Abstract" id="2410.02892"> arXiv:2410.02892 </a> (replaced) [<a href="/pdf/2410.02892" title="Download PDF" id="pdf-2410.02892" aria-labelledby="pdf-2410.02892">pdf</a>, <a href="https://arxiv.org/html/2410.02892v2" title="View HTML" id="html-2410.02892" aria-labelledby="html-2410.02892" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.02892" title="Other formats" id="oth-2410.02892" aria-labelledby="oth-2410.02892">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The Role of Deductive and Inductive Reasoning in Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+C">Chengkun Cai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+H">Haoliang Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+Z">Zhongyu Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+T">Tianfang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Z">Zongkai Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hwang,+J">Jenq-Neng Hwang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Belongie,+S">Serge Belongie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lei Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large Language Models (LLMs) have demonstrated impressive capabilities in reasoning tasks, yet their reliance on static prompt structures and 
limited adaptability to complex scenarios remains a significant challenge. In this paper, we propose the Deductive and InDuctive(DID) method, a novel framework that enhances LLM reasoning by dynamically integrating both deductive and inductive reasoning approaches. Drawing from cognitive science principles, DID implements a dual-metric complexity evaluation system that combines Littlestone dimension and information entropy to precisely assess task difficulty and guide decomposition strategies. DID enables the model to progressively adapt its reasoning pathways based on problem complexity, mirroring human cognitive processes. We evaluate DID's effectiveness across multiple benchmarks, including the AIW and MR-GSM8K, as well as our custom Holiday Puzzle dataset for temporal reasoning. Our results demonstrate significant improvements in reasoning quality and solution accuracy - achieving 70.3% accuracy on AIW (compared to 62.2% for Tree of Thought) while maintaining lower computational costs. The success of DID in improving LLM performance while preserving computational efficiency suggests promising directions for developing more cognitively aligned and capable language models. Our work contributes a theoretically grounded, input-centric approach to enhancing LLM reasoning capabilities, offering an efficient alternative to traditional output-exploration methods. 
</p> </div> </dd> <dt> <a name='item458'>[458]</a> <a href ="/abs/2410.10114" title="Abstract" id="2410.10114"> arXiv:2410.10114 </a> (replaced) [<a href="/pdf/2410.10114" title="Download PDF" id="pdf-2410.10114" aria-labelledby="pdf-2410.10114">pdf</a>, <a href="https://arxiv.org/html/2410.10114v3" title="View HTML" id="html-2410.10114" aria-labelledby="html-2410.10114" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10114" title="Other formats" id="oth-2410.10114" aria-labelledby="oth-2410.10114">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mixture of Experts Made Personalized: Federated Prompt Learning for Vision-Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+J">Jun Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+C">Chen Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shandong Wu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV) </div> <p class='mathjax'> Federated prompt learning benefits federated learning with CLIP-like Vision-Language Model's (VLM's) robust representation learning ability through prompt learning. However, current federated prompt learning methods are habitually restricted to the traditional FL paradigm, where the participating clients are generally only allowed to download a single globally aggregated model from the server. While justifiable for training full-sized models under federated settings, in this work, we argue that this paradigm is ill-suited for lightweight prompts. 
By facilitating the clients to download multiple pre-aggregated prompts as fixed non-local experts, we propose Personalized Federated Mixture of Adaptive Prompts (pFedMoAP), a novel FL framework that personalizes the prompt learning process through the lens of Mixture of Experts (MoE). pFedMoAP implements a local attention-based gating network that learns to generate enhanced text features for better alignment with local image data, benefiting from both local and downloaded non-local adaptive prompt experts. Extensive experiments on 9 datasets under various federated settings demonstrate the efficacy of the proposed pFedMoAP algorithm. The code is available at <a href="https://github.com/ljaiverson/pFedMoAP" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item459'>[459]</a> <a href ="/abs/2410.10293" title="Abstract" id="2410.10293"> arXiv:2410.10293 </a> (replaced) [<a href="/pdf/2410.10293" title="Download PDF" id="pdf-2410.10293" aria-labelledby="pdf-2410.10293">pdf</a>, <a href="https://arxiv.org/html/2410.10293v3" title="View HTML" id="html-2410.10293" aria-labelledby="html-2410.10293" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.10293" title="Other formats" id="oth-2410.10293" aria-labelledby="oth-2410.10293">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FunnelRAG: A Coarse-to-Fine Progressive Retrieval Paradigm for RAG </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xinping Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+Y">Yan Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Z">Zetian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+X">Xinshuo Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhenyu Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Li,+D">Dongfang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+B">Baotian Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+M">Min Zhang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 18 pages, 6 figures, 13 tables. Accepted by NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Retrieval-Augmented Generation (RAG) prevails in Large Language Models. It mainly consists of retrieval and generation. The retrieval modules (a.k.a. retrievers) aim to find useful information used to facilitate the generation modules (a.k.a. generators). As such, generators' performance largely depends on the effectiveness and efficiency of retrievers. However, the widely used retrieval paradigm remains flat. It treats retrieval procedures as a one-off deal with constant granularity. Despite effectiveness, we argue that they suffer from two limitations: (1) flat retrieval exerts a significant burden on one retriever; (2) constant granularity limits the ceiling of retrieval performance. In this work, we propose a progressive retrieval paradigm with coarse-to-fine granularity for RAG, termed FunnelRAG, so as to balance effectiveness and efficiency. Specifically, FunnelRAG establishes a progressive retrieval pipeline by collaborating coarse-to-fine granularity, large-to-small quantity, and low-to-high capacity, which can relieve the burden on one retriever and also promote the ceiling of retrieval performance. Extensive experiments manifest that FunnelRAG achieves comparable retrieval performance while the time overhead is reduced by nearly 40 percent. 
</p> </div> </dd> <dt> <a name='item460'>[460]</a> <a href ="/abs/2410.12228" title="Abstract" id="2410.12228"> arXiv:2410.12228 </a> (replaced) [<a href="/pdf/2410.12228" title="Download PDF" id="pdf-2410.12228" aria-labelledby="pdf-2410.12228">pdf</a>, <a href="https://arxiv.org/html/2410.12228v2" title="View HTML" id="html-2410.12228" aria-labelledby="html-2410.12228" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.12228" title="Other formats" id="oth-2410.12228" aria-labelledby="oth-2410.12228">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Triple Modality Fusion: Aligning Visual, Textual, and Graph Data with Large Language Models for Multi-Behavior Recommendations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+L">Luyi Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaohan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fan,+Z">Zezhong Fan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+K">Kai Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+J">Jianpeng Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cho,+J">Jason Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kanumala,+P">Praveen Kanumala</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nag,+K">Kaushiki Nag</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+S">Sushant Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Achan,+K">Kannan Achan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Integrating diverse data modalities is crucial for enhancing the performance of personalized 
recommendation systems. Traditional models, which often rely on singular data sources, lack the depth needed to accurately capture the multifaceted nature of item features and user behaviors. This paper introduces a novel framework for multi-behavior recommendations, leveraging the fusion of triple-modality, which is visual, textual, and graph data through alignment with large language models (LLMs). By incorporating visual information, we capture contextual and aesthetic item characteristics; textual data provides insights into user interests and item features in detail; and graph data elucidates relationships within the item-behavior heterogeneous graphs. Our proposed model called Triple Modality Fusion (TMF) utilizes the power of LLMs to align and integrate these three modalities, achieving a comprehensive representation of user behaviors. The LLM models the user's interactions including behaviors and item features in natural languages. Initially, the LLM is warmed up using only natural language-based prompts. We then devise the modality fusion module based on cross-attention and self-attention mechanisms to integrate different modalities from other models into the same embedding space and incorporate them into an LLM. Extensive experiments demonstrate the effectiveness of our approach in improving recommendation accuracy. Further ablation studies validate the effectiveness of our model design and benefits of the TMF. 
</p> </div> </dd> <dt> <a name='item461'>[461]</a> <a href ="/abs/2410.15281" title="Abstract" id="2410.15281"> arXiv:2410.15281 </a> (replaced) [<a href="/pdf/2410.15281" title="Download PDF" id="pdf-2410.15281" aria-labelledby="pdf-2410.15281">pdf</a>, <a href="https://arxiv.org/html/2410.15281v2" title="View HTML" id="html-2410.15281" aria-labelledby="html-2410.15281" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.15281" title="Other formats" id="oth-2410.15281" aria-labelledby="oth-2410.15281">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Large Language Models for Autonomous Driving (LLM4AD): Concept, Benchmark, Experiments, and Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cui,+C">Can Cui</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+Y">Yunsheng Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Z">Zichong Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yupeng Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+P">Peiran Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+J">Juanwu Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lingxi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yaobin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Panchal,+J+H">Jitesh H. 
Panchal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abdelraouf,+A">Amr Abdelraouf</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gupta,+R">Rohit Gupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+K">Kyungtae Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Ziran Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Robotics (cs.RO)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> With the broader usage and highly successful development of Large Language Models (LLMs), there has been a growth of interest and demand for applying LLMs to autonomous driving technology. Driven by their natural language understanding and reasoning ability, LLMs have the potential to enhance various aspects of autonomous driving systems, from perception and scene understanding to language interaction and decision-making. In this paper, we first introduce the novel concept of designing LLMs for autonomous driving (LLM4AD). Then, we propose a comprehensive benchmark for evaluating the instruction-following abilities of LLM4AD in simulation. Furthermore, we conduct a series of experiments on real-world vehicle platforms, thoroughly evaluating the performance and potential of our LLM4AD systems. Finally, we envision the main challenges of LLM4AD, including latency, deployment, security and privacy, safety, trust and transparency, and personalization. Our research highlights the significant potential of LLMs to enhance various aspects of autonomous vehicle technology, from perception and scene understanding to language interaction and decision-making. 
</p> </div> </dd> <dt> <a name='item462'>[462]</a> <a href ="/abs/2410.15332" title="Abstract" id="2410.15332"> arXiv:2410.15332 </a> (replaced) [<a href="/pdf/2410.15332" title="Download PDF" id="pdf-2410.15332" aria-labelledby="pdf-2410.15332">pdf</a>, <a href="https://arxiv.org/html/2410.15332v2" title="View HTML" id="html-2410.15332" aria-labelledby="html-2410.15332" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.15332" title="Other formats" id="oth-2410.15332" aria-labelledby="oth-2410.15332">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EPIC: Efficient Position-Independent Context Caching for Serving Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Junhao Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wenrui Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haoyi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Weidong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+T">Tiancheng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Q">Qin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+H">Hao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xusheng Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shan,+Y">Yizhou Shan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+T">Tao Xie</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL); Distributed, Parallel, and Cluster Computing (cs.DC); Performance (cs.PF) </div> <p class='mathjax'> Large Language Models (LLMs) are critical for a wide range of applications, but serving them efficiently 
becomes increasingly challenging as inputs become more complex. Context caching improves serving performance by exploiting inter-request dependency and reusing key-value (KV) cache across requests, thus improving time-to-first-token (TTFT). However, existing prefix-based context caching requires exact token prefix matches, limiting cache reuse in few-shot learning, multi-document QA, or retrieval-augmented generation, where prefixes may vary. In this paper, we present EPIC, an LLM serving system that introduces position-independent context caching (PIC), enabling modular KV cache reuse regardless of token chunk position (or prefix). EPIC features two key designs: AttnLink, which leverages static attention sparsity to minimize recomputation for accuracy recovery, and KVSplit, a customizable chunking method that preserves semantic coherence. Our experiments demonstrate that Epic delivers up to 8x improvements in TTFT and 7x throughput over existing systems, with negligible or no accuracy loss. By addressing the limitations of traditional caching approaches, Epic enables more scalable and efficient LLM inference. 
</p> </div> </dd> <dt> <a name='item463'>[463]</a> <a href ="/abs/2410.16204" title="Abstract" id="2410.16204"> arXiv:2410.16204 </a> (replaced) [<a href="/pdf/2410.16204" title="Download PDF" id="pdf-2410.16204" aria-labelledby="pdf-2410.16204">pdf</a>, <a href="https://arxiv.org/html/2410.16204v3" title="View HTML" id="html-2410.16204" aria-labelledby="html-2410.16204" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.16204" title="Other formats" id="oth-2410.16204" aria-labelledby="oth-2410.16204">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Machine Learning Approaches for Mental Illness Detection on Social Media: A Systematic Review of Biases and Methodological Challenges </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+Y">Yuchen Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+J">Jianglai Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhongyan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yeyubei Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+X">Xiaorui Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Y">Yunchong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Yexin Tian</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Journal of Behavioral Data Science, 5(1) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> The global increase in mental illness requires innovative detection methods for early intervention. Social media provides a valuable platform to identify mental illness through user-generated content. 
This systematic review examines machine learning (ML) models for detecting mental illness, with a particular focus on depression, using social media data. It highlights biases and methodological challenges encountered throughout the ML lifecycle. A search of PubMed, IEEE Xplore, and Google Scholar identified 47 relevant studies published after 2010. The Prediction model Risk Of Bias ASsessment Tool (PROBAST) was utilized to assess methodological quality and risk of bias. <br>The review reveals significant biases affecting model reliability and generalizability. A predominant reliance on Twitter (63.8%) and English-language content (over 90%) limits diversity, with most studies focused on users from the United States and Europe. Non-probability sampling (80%) limits representativeness. Only 23% explicitly addressed linguistic nuances like negations, crucial for accurate sentiment analysis. Inconsistent hyperparameter tuning (27.7%) and inadequate data partitioning (17%) risk overfitting. While 74.5% used appropriate evaluation metrics for imbalanced data, others relied on accuracy without addressing class imbalance, potentially skewing results. Reporting transparency varied, often lacking critical methodological details. <br>These findings highlight the need to diversify data sources, standardize preprocessing, ensure consistent model development, address class imbalance, and enhance reporting transparency. By overcoming these challenges, future research can develop more robust and generalizable ML models for depression detection on social media, contributing to improved mental health outcomes globally. 
</p> </div> </dd> <dt> <a name='item464'>[464]</a> <a href ="/abs/2410.18057" title="Abstract" id="2410.18057"> arXiv:2410.18057 </a> (replaced) [<a href="/pdf/2410.18057" title="Download PDF" id="pdf-2410.18057" aria-labelledby="pdf-2410.18057">pdf</a>, <a href="https://arxiv.org/html/2410.18057v3" title="View HTML" id="html-2410.18057" aria-labelledby="html-2410.18057" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.18057" title="Other formats" id="oth-2410.18057" aria-labelledby="oth-2410.18057">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLEAR: Character Unlearning in Textual and Visual Modalities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dontsov,+A">Alexey Dontsov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Korzh,+D">Dmitrii Korzh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhavoronkin,+A">Alexey Zhavoronkin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mikheev,+B">Boris Mikheev</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bobkov,+D">Denis Bobkov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alanov,+A">Aibek Alanov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rogov,+O+Y">Oleg Y. Rogov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Oseledets,+I">Ivan Oseledets</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tutubalina,+E">Elena Tutubalina</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Machine Unlearning (MU) is critical for removing private or hazardous information from deep learning models. 
While MU has advanced significantly in unimodal (text or vision) settings, multimodal unlearning (MMU) remains underexplored due to the lack of open benchmarks for evaluating cross-modal data removal. To address this gap, we introduce CLEAR, the first open-source benchmark designed specifically for MMU. CLEAR contains 200 fictitious individuals and 3,700 images linked with corresponding question-answer pairs, enabling a thorough evaluation across modalities. We conduct a comprehensive analysis of 11 MU methods (e.g., SCRUB, gradient ascent, DPO) across four evaluation sets, demonstrating that jointly unlearning both modalities outperforms single-modality approaches. The dataset is available at <a href="https://huggingface.co/datasets/therem/CLEAR" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </p> </div> </dd> <dt> <a name='item465'>[465]</a> <a href ="/abs/2410.19817" title="Abstract" id="2410.19817"> arXiv:2410.19817 </a> (replaced) [<a href="/pdf/2410.19817" title="Download PDF" id="pdf-2410.19817" aria-labelledby="pdf-2410.19817">pdf</a>, <a href="https://arxiv.org/html/2410.19817v2" title="View HTML" id="html-2410.19817" aria-labelledby="html-2410.19817" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.19817" title="Other formats" id="oth-2410.19817" aria-labelledby="oth-2410.19817">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step Guided Reasoning: Improving Mathematical Reasoning using Guidance Generation and Step Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+L">Lang Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+C">Chao Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+R">Renhong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ning,+W">Wu Ning</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zou,+Y">Yingtian Zou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yitong Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Human-Computer Interaction (cs.HC) </div> <p class='mathjax'> Mathematical reasoning has been challenging for large language models (LLMs). However, the introduction of step-by-step Chain-of-Thought (CoT) inference has significantly advanced the mathematical capabilities of LLMs. Despite this progress, current approaches either necessitate extensive inference datasets for training or depend on few-shot methods that frequently compromise computational accuracy. To address these bottlenecks in mathematical reasoning, we propose a novel method called Step Guided Reasoning, which is more stable and generalizable than few-shot methods and does not involve further fine-tuning of the model. In this approach, LLMs reflect on small reasoning steps, similar to how humans deliberate and focus attention on what to do next. By incorporating this reflective process into the inference stage, LLMs can effectively guide their reasoning from one step to the next. Through extensive experiments, we demonstrate the significant effect of Step Guided Reasoning in augmenting mathematical performance in state-of-the-art language models. Qwen2-72B-Instruct outperforms its math-specific counterpart, Qwen2.5-72B-Math-Instruct, on MMLU-STEM with a score of 90.9%, compared to 87.3%. The average scores of Qwen2-7B-Instruct and Qwen2-72B-Instruct increase from 27.1% to 36.3% and from 36.5% to 47.4% on the mathematics domain, respectively. 
</p> </div> </dd> <dt> <a name='item466'>[466]</a> <a href ="/abs/2410.22353" title="Abstract" id="2410.22353"> arXiv:2410.22353 </a> (replaced) [<a href="/pdf/2410.22353" title="Download PDF" id="pdf-2410.22353" aria-labelledby="pdf-2410.22353">pdf</a>, <a href="https://arxiv.org/html/2410.22353v3" title="View HTML" id="html-2410.22353" aria-labelledby="html-2410.22353" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.22353" title="Other formats" id="oth-2410.22353" aria-labelledby="oth-2410.22353">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RuleRAG: Rule-Guided Retrieval-Augmented Generation with Language Models for Question Answering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zhongwu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chengjin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dingmin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Z">Zhen Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dou,+Y">Yong Dou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xuhui Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Jian Guo</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Retrieval-augmented generation (RAG) has shown promising potential in knowledge intensive question answering (QA). However, existing approaches only consider the query itself, neither specifying the retrieval preferences for the retrievers nor informing the generators of how to refer to the retrieved documents for the answers, which poses a significant challenge to the QA performance. 
To address these issues, we propose Rule-guided Retrieval-Augmented Generation with LMs, which explicitly introduces rules for in-context learning (RuleRAG-ICL) to guide retrievers to recall related documents in the directions of rules and uniformly guide generators to reason attributed by the same rules. Moreover, most existing RAG datasets were constructed without considering rules and Knowledge Graphs (KGs) are recognized as providing high-quality rules. Therefore, we construct five rule-aware RAG benchmarks for QA, RuleQA, based on KGs to stress the significance of retrieval and reasoning with rules. Experiments on RuleQA demonstrate RuleRAG-ICL improves the retrieval quality of +89.2% in Recall@10 and answer accuracy of +103.1% in Exact Match, and RuleRAG-FT yields more enhancement. In addition, experiments on four existing RAG datasets show RuleRAG is also effective by offering rules in RuleQA to them, further proving the generalization of rule guidance in RuleRAG. </p> </div> </dd> <dt> <a name='item467'>[467]</a> <a href ="/abs/2411.02400" title="Abstract" id="2411.02400"> arXiv:2411.02400 </a> (replaced) [<a href="/pdf/2411.02400" title="Download PDF" id="pdf-2411.02400" aria-labelledby="pdf-2411.02400">pdf</a>, <a href="https://arxiv.org/html/2411.02400v2" title="View HTML" id="html-2411.02400" aria-labelledby="html-2411.02400" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.02400" title="Other formats" id="oth-2411.02400" aria-labelledby="oth-2411.02400">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Decomposition Dilemmas: Does Claim Decomposition Boost or Burden Fact-Checking Performance? 
</div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+Q">Qisheng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Long,+Q">Quanyu Long</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenya Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> NAACL 2025 Main </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Fact-checking pipelines increasingly adopt the Decompose-Then-Verify paradigm, where texts are broken down into smaller claims for individual verification and subsequently combined for a veracity decision. While decomposition is widely-adopted in such pipelines, its effects on final fact-checking performance remain underexplored. Some studies have reported improvements from decomposition, while others have observed performance declines, indicating its inconsistent impact. To date, no comprehensive analysis has been conducted to understand this variability. To address this gap, we present an in-depth analysis that explicitly examines the impact of decomposition on downstream verification performance. Through error case inspection and experiments, we introduce a categorization of decomposition errors and reveal a trade-off between accuracy gains and the noise introduced through decomposition. Our analysis provides new insights into understanding current system's instability and offers guidance for future studies toward improving claim decomposition in fact-checking pipelines. 
</p> </div> </dd> <dt> <a name='item468'>[468]</a> <a href ="/abs/2411.03823" title="Abstract" id="2411.03823"> arXiv:2411.03823 </a> (replaced) [<a href="/pdf/2411.03823" title="Download PDF" id="pdf-2411.03823" aria-labelledby="pdf-2411.03823">pdf</a>, <a href="https://arxiv.org/html/2411.03823v2" title="View HTML" id="html-2411.03823" aria-labelledby="html-2411.03823" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.03823" title="Other formats" id="oth-2411.03823" aria-labelledby="oth-2411.03823">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM Data Contamination </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+D">Dingjie Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lai,+S">Sicheng Lai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+S">Shunian Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Lichao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Benyou Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Code Available: <a href="https://github.com/MLLM-Data-Contamination/MM-Detect" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Multimedia (cs.MM) </div> <p class='mathjax'> The rapid progression of multimodal large language models (MLLMs) has demonstrated superior performance on various multimodal benchmarks. However, the issue of data contamination during training creates challenges in performance evaluation and comparison. 
While numerous methods exist for detecting models' contamination in large language models (LLMs), they are less effective for MLLMs due to their various modalities and multiple training phases. In this study, we introduce a multimodal data contamination detection framework, MM-Detect, designed for MLLMs. Our experimental results indicate that MM-Detect is quite effective and sensitive in identifying varying degrees of contamination, and can highlight significant performance improvements due to the leakage of multimodal benchmark training sets. Furthermore, we explore whether the contamination originates from the base LLMs used by MLLMs or the multimodal training phase, providing new insights into the stages at which contamination may be introduced. </p> </div> </dd> <dt> <a name='item469'>[469]</a> <a href ="/abs/2411.07546" title="Abstract" id="2411.07546"> arXiv:2411.07546 </a> (replaced) [<a href="/pdf/2411.07546" title="Download PDF" id="pdf-2411.07546" aria-labelledby="pdf-2411.07546">pdf</a>, <a href="https://arxiv.org/html/2411.07546v2" title="View HTML" id="html-2411.07546" aria-labelledby="html-2411.07546" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.07546" title="Other formats" id="oth-2411.07546" aria-labelledby="oth-2411.07546">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Contrastive Language Prompting to Ease False Positives in Medical Anomaly Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+Y">YeongHyeon Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M+J">Myung Jin Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H+S">Hyeong Seok Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 3 figures, 2 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> A pre-trained visual-language model, contrastive language-image pre-training (CLIP), successfully accomplishes various downstream tasks with text prompts, such as finding images or localizing regions within the image. Despite CLIP's strong multi-modal data capabilities, it remains limited in specialized environments, such as medical applications. For this purpose, many CLIP variants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives related to normal regions persist. Thus, we aim to present a simple yet important goal of reducing false positives in medical anomaly detection. We introduce a Contrastive LAnguage Prompting (CLAP) method that leverages both positive and negative text prompts. This straightforward approach identifies potential lesion regions by visual attention to the positive prompts in the given image. To reduce false positives, we attenuate attention on normal regions using negative prompts. Extensive experiments with the BMAD dataset, including six biomedical benchmarks, demonstrate that CLAP method enhances anomaly detection performance. Our future plans include developing an automated fine prompting method for more practical usage. 
</p> </div> </dd> <dt> <a name='item470'>[470]</a> <a href ="/abs/2411.14708" title="Abstract" id="2411.14708"> arXiv:2411.14708 </a> (replaced) [<a href="/pdf/2411.14708" title="Download PDF" id="pdf-2411.14708" aria-labelledby="pdf-2411.14708">pdf</a>, <a href="https://arxiv.org/html/2411.14708v3" title="View HTML" id="html-2411.14708" aria-labelledby="html-2411.14708" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14708" title="Other formats" id="oth-2411.14708" aria-labelledby="oth-2411.14708">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Understanding LLM Embeddings for Regression </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tang,+E">Eric Tang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+B">Bangding Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xingyou Song</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published in Transactions on Machine Learning Research (TMLR) 2025. Code can be found in <a href="https://github.com/google-research/optformer" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> With the rise of large language models (LLMs) for flexibly processing information as strings, a natural application is regression, specifically by preprocessing string representations into LLM embeddings as downstream features for metric prediction. 
In this paper, we provide one of the first comprehensive investigations into embedding-based regression and demonstrate that LLM embeddings as features can be better for high-dimensional regression tasks than using traditional feature engineering. This regression performance can be explained in part due to LLM embeddings over numeric data inherently preserving Lipschitz continuity over the feature space. Furthermore, we quantify the contribution of different model effects, most notably model size and language understanding, which we find surprisingly do not always improve regression performance. </p> </div> </dd> <dt> <a name='item471'>[471]</a> <a href ="/abs/2411.15737" title="Abstract" id="2411.15737"> arXiv:2411.15737 </a> (replaced) [<a href="/pdf/2411.15737" title="Download PDF" id="pdf-2411.15737" aria-labelledby="pdf-2411.15737">pdf</a>, <a href="https://arxiv.org/html/2411.15737v3" title="View HTML" id="html-2411.15737" aria-labelledby="html-2411.15737" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.15737" title="Other formats" id="oth-2411.15737" aria-labelledby="oth-2411.15737">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TableTime: Reformulating Time Series Classification as Training-Free Table Understanding with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiahao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+M">Mingyue Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+Q">Qingyang Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yitong Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+F">Feiyang Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xin Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) have demonstrated their effectiveness in multivariate time series classification (MTSC). Effective adaptation of LLMs for MTSC necessitates informative data representations. Existing LLM-based methods directly encode embeddings for time series within the latent space of LLMs from scratch to align with semantic space of LLMs. Despite their effectiveness, we reveal that these methods conceal three inherent bottlenecks: (1) they struggle to encode temporal and channel-specific information in a lossless manner, both of which are critical components of multivariate time series; (2) it is much difficult to align the learned representation space with the semantic space of the LLMs; (3) they require task-specific retraining, which is both computationally expensive and labor-intensive. To bridge these gaps, we propose TableTime, which reformulates MTSC as a table understanding task. Specifically, TableTime introduces the following strategies: (1) convert multivariate time series into a tabular form, thus minimizing information loss to the greatest extent; (2) represent tabular time series in text format to achieve natural alignment with the semantic space of LLMs; (3) design a reasoning framework that integrates contextual text information, neighborhood assistance, multi-path inference and problem decomposition to enhance the reasoning ability of LLMs and realize zero-shot classification. Extensive experiments performed on 10 publicly representative datasets from UEA archive verify the superiorities of the TableTime. 
</p> </div> </dd> <dt> <a name='item472'>[472]</a> <a href ="/abs/2411.19378" title="Abstract" id="2411.19378"> arXiv:2411.19378 </a> (replaced) [<a href="/pdf/2411.19378" title="Download PDF" id="pdf-2411.19378" aria-labelledby="pdf-2411.19378">pdf</a>, <a href="https://arxiv.org/html/2411.19378v2" title="View HTML" id="html-2411.19378" aria-labelledby="html-2411.19378" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.19378" title="Other formats" id="oth-2411.19378" aria-labelledby="oth-2411.19378">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Libra: Leveraging Temporal Images for Biomedical Radiology Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meng,+Z">Zaiqiao Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lever,+J">Jake Lever</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ho,+E+S+L">Edmond S. L. Ho</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 30 pages, 5 figures, Adding Appendix </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Radiology report generation (RRG) requires advanced medical image analysis, effective temporal reasoning, and accurate text generation. While multimodal large language models (MLLMs) align with pre-trained vision encoders to enhance visual-language understanding, most existing methods rely on single-image analysis or rule-based heuristics to process multiple images, failing to fully leverage temporal information in multi-modal medical datasets. 
In this paper, we introduce Libra, a temporal-aware MLLM tailored for chest X-ray report generation. Libra combines a radiology-specific image encoder with a novel Temporal Alignment Connector (TAC), designed to accurately capture and integrate temporal differences between paired current and prior images. Extensive experiments on the MIMIC-CXR dataset demonstrate that Libra establishes a new state-of-the-art benchmark among similarly scaled MLLMs, setting new standards in both clinical relevance and lexical accuracy. </p> </div> </dd> <dt> <a name='item473'>[473]</a> <a href ="/abs/2412.00069" title="Abstract" id="2412.00069"> arXiv:2412.00069 </a> (replaced) [<a href="/pdf/2412.00069" title="Download PDF" id="pdf-2412.00069" aria-labelledby="pdf-2412.00069">pdf</a>, <a href="https://arxiv.org/html/2412.00069v2" title="View HTML" id="html-2412.00069" aria-labelledby="html-2412.00069" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.00069" title="Other formats" id="oth-2412.00069" aria-labelledby="oth-2412.00069">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Condense, Don't Just Prune: Enhancing Efficiency and Performance in MoE Layer Pruning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+M">Mingyu Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Gen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+J">Jie Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jiaqi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+X">Xiaolong Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Shiwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+L">Lu Yin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning 
(cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Mixture-of-Experts (MoE) has garnered significant attention for its ability to scale up neural networks while utilizing the same or even fewer active parameters. However, MoE does not alleviate the massive memory requirements of networks, which limits their practicality in real-world applications, especially in the era of large language models (LLMs). While recent work explores the possibility of removing entire layers of MoE to reduce memory, the performance degradation is still notable. In this paper, we propose ConDense-MoE (CD-MoE), which, instead of dropping the entire MoE layer, condenses the large, sparse MoE layer into a smaller, denser layer with only a few experts activated for all tokens, while maintaining hardware friendliness. Our approach is specifically designed for fine-grained MoE with shared experts, where Feed-Forward Networks are split into many small experts, with certain experts isolated to serve as shared experts that are always activated, such as DeepSeekMoE and QwenMoE. We demonstrate the effectiveness of our method. Specifically, for the DeepSeekMoE-16B model, our approach maintains 90% of the average accuracy while reducing memory usage by 27.5% and increasing inference speed by 1.26 times. Moreover, we show that by applying lightweight expert fine-tuning -- only to the condensed layers -- and using 5 hours on a single 80G A100 GPU, we can successfully recover 98% of the original performance. Our code is available at: <a href="https://github.com/duterscmy/CD-MoE/tree/main" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item474'>[474]</a> <a href ="/abs/2412.00804" title="Abstract" id="2412.00804"> arXiv:2412.00804 </a> (replaced) [<a href="/pdf/2412.00804" title="Download PDF" id="pdf-2412.00804" aria-labelledby="pdf-2412.00804">pdf</a>, <a href="https://arxiv.org/html/2412.00804v2" title="View HTML" id="html-2412.00804" aria-labelledby="html-2412.00804" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.00804" title="Other formats" id="oth-2412.00804" aria-labelledby="oth-2412.00804">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Examining Identity Drift in Conversations of LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Choi,+J">Junhyuk Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+Y">Yeseon Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+M">Minju Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+B">Bugeun Kim</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computers and Society (cs.CY)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Models (LLMs) show impressive conversational abilities but sometimes show identity drift problems, where their interaction patterns or styles change over time. As the problem has not been thoroughly examined yet, this study examines identity consistency across nine LLMs. Specifically, we (1) investigate whether LLMs could maintain consistent patterns (or identity) and (2) analyze the effect of the model family, parameter sizes, and provided persona types. Our experiments involve multi-turn conversations on personal themes, analyzed in qualitative and quantitative ways. 
Experimental results indicate three findings. (1) Larger models experience greater identity drift. (2) Model differences exist, but their effect is not stronger than parameter sizes. (3) Assigning a persona may not help to maintain identity. We hope these three findings can help to improve persona stability in AI-driven dialogue systems, particularly in long-term conversations. </p> </div> </dd> <dt> <a name='item475'>[475]</a> <a href ="/abs/2412.06394" title="Abstract" id="2412.06394"> arXiv:2412.06394 </a> (replaced) [<a href="/pdf/2412.06394" title="Download PDF" id="pdf-2412.06394" aria-labelledby="pdf-2412.06394">pdf</a>, <a href="https://arxiv.org/html/2412.06394v5" title="View HTML" id="html-2412.06394" aria-labelledby="html-2412.06394" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.06394" title="Other formats" id="oth-2412.06394" aria-labelledby="oth-2412.06394">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> GameArena: Evaluating LLM Reasoning through Live Computer Games </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+L">Lanxiang Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Q">Qiyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+A">Anze Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+N">Nan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stoica,+I">Ion Stoica</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+H">Haojian Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hao Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Evaluating the reasoning abilities of large language models (LLMs) is 
challenging. Existing benchmarks often depend on static datasets, which are vulnerable to data contamination and may get saturated over time, or on binary live human feedback that conflates reasoning with other abilities. As the most prominent dynamic benchmark, Chatbot Arena evaluates open-ended questions in real-world settings, but lacks the granularity in assessing specific reasoning capabilities. We introduce GameArena, a dynamic benchmark designed to evaluate LLM reasoning capabilities through interactive gameplay with humans. GameArena consists of three games designed to test specific reasoning capabilities (e.g., deductive and inductive reasoning), while keeping participants entertained and engaged. We analyze the gaming data retrospectively to uncover the underlying reasoning processes of LLMs and measure their fine-grained reasoning capabilities. We collect over 2000 game sessions and provide detailed assessments of various reasoning capabilities for five state-of-the-art LLMs. Our user study with 100 participants suggests that GameArena improves user engagement compared to Chatbot Arena. For the first time, GameArena enables the collection of step-by-step LLM reasoning data in the wild. 
</p> </div> </dd> <dt> <a name='item476'>[476]</a> <a href ="/abs/2412.06720" title="Abstract" id="2412.06720"> arXiv:2412.06720 </a> (replaced) [<a href="/pdf/2412.06720" title="Download PDF" id="pdf-2412.06720" aria-labelledby="pdf-2412.06720">pdf</a>, <a href="https://arxiv.org/html/2412.06720v4" title="View HTML" id="html-2412.06720" aria-labelledby="html-2412.06720" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.06720" title="Other formats" id="oth-2412.06720" aria-labelledby="oth-2412.06720">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VP-MEL: Visual Prompts Guided Multimodal Entity Linking </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mi,+H">Hongze Mi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jinyuan Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuying Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+H">Haoran Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiahao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+D">Di Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+G">Gang Pan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Multimodal entity linking (MEL), a task aimed at linking mentions within multimodal contexts to their corresponding entities in a knowledge base (KB), has attracted much attention due to its wide applications in recent years. However, existing MEL methods often rely on mention words as retrieval cues, which limits their ability to effectively utilize information from both images and text. 
This reliance causes MEL to struggle with accurately retrieving entities in certain scenarios, especially when the focus is on image objects or mention words are missing from the text. To solve these issues, we introduce a Visual Prompts guided Multimodal Entity Linking (VP-MEL) task. Given a text-image pair, VP-MEL aims to link a marked region (i.e., visual prompt) in an image to its corresponding entities in the knowledge base. To facilitate this task, we present a new dataset, VPWiki, specifically designed for VP-MEL. Furthermore, we propose a framework named IIER, which enhances visual feature extraction using visual prompts and leverages the pretrained Detective-VLM model to capture latent information. Experimental results on the VPWiki dataset demonstrate that IIER outperforms baseline methods across multiple benchmarks for the VP-MEL task. </p> </div> </dd> <dt> <a name='item477'>[477]</a> <a href ="/abs/2412.11694" title="Abstract" id="2412.11694"> arXiv:2412.11694 </a> (replaced) [<a href="/pdf/2412.11694" title="Download PDF" id="pdf-2412.11694" aria-labelledby="pdf-2412.11694">pdf</a>, <a href="https://arxiv.org/html/2412.11694v2" title="View HTML" id="html-2412.11694" aria-labelledby="html-2412.11694" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.11694" title="Other formats" id="oth-2412.11694" aria-labelledby="oth-2412.11694">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Specific-MLLMs to Omni-MLLMs: A Survey on MLLMs Aligned with Multi-modalities </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+S">Shixin Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+J">Jiafeng Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+J">Jiyuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+X">Xuan Dong</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+H">Heng Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+W">Weijiang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Du,+J">Jinhua Du</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Ming Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+B">Bing Qin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 35 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> To tackle complex tasks in real-world scenarios, more researchers are focusing on Omni-MLLMs, which aim to achieve omni-modal understanding and generation. Beyond the constraints of any specific non-linguistic modality, Omni-MLLMs map various non-linguistic modalities into the embedding space of LLMs and enable the interaction and understanding of arbitrary combinations of modalities within a single model. In this paper, we systematically investigate relevant research and provide a comprehensive survey of Omni-MLLMs. Specifically, we first explain the four core components of Omni-MLLMs for unified multi-modal modeling with a meticulous taxonomy that offers novel perspectives. Then, we introduce the effective integration achieved through two-stage training and discuss the corresponding datasets as well as evaluation. Furthermore, we summarize the main challenges of current Omni-MLLMs and outline future directions. We hope this paper serves as an introduction for beginners and promotes the advancement of related research. Resources will be made public. 
</p> </div> </dd> <dt> <a name='item478'>[478]</a> <a href ="/abs/2412.13631" title="Abstract" id="2412.13631"> arXiv:2412.13631 </a> (replaced) [<a href="/pdf/2412.13631" title="Download PDF" id="pdf-2412.13631" aria-labelledby="pdf-2412.13631">pdf</a>, <a href="https://arxiv.org/html/2412.13631v2" title="View HTML" id="html-2412.13631" aria-labelledby="html-2412.13631" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2412.13631" title="Other formats" id="oth-2412.13631" aria-labelledby="oth-2412.13631">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mind Your Theory: Theory of Mind Goes Deeper Than Reasoning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wagner,+E">Eitan Wagner</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alon,+N">Nitay Alon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Barnby,+J+M">Joseph M. Barnby</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Abend,+O">Omri Abend</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Theory of Mind (ToM) capabilities in LLMs have recently become a central object of investigation. Cognitive science distinguishes between two steps required for ToM tasks: 1) determine whether to invoke ToM, which includes the appropriate Depth of Mentalizing (DoM), or level of recursion required to complete a task; and 2) applying the correct inference given the DoM. In this position paper, we first identify several lines of work in different communities in AI, including LLM benchmarking, ToM add-ons, ToM probing, and formal models for ToM. 
We argue that recent work in AI tends to focus exclusively on the second step, which is typically framed as a static logic problem. We conclude with suggestions for improved evaluation of ToM capabilities inspired by dynamic environments used in cognitive tasks. </p> </div> </dd> <dt> <a name='item479'>[479]</a> <a href ="/abs/2501.01558" title="Abstract" id="2501.01558"> arXiv:2501.01558 </a> (replaced) [<a href="/pdf/2501.01558" title="Download PDF" id="pdf-2501.01558" aria-labelledby="pdf-2501.01558">pdf</a>, <a href="https://arxiv.org/html/2501.01558v2" title="View HTML" id="html-2501.01558" aria-labelledby="html-2501.01558" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.01558" title="Other formats" id="oth-2501.01558" aria-labelledby="oth-2501.01558">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Predicting the Performance of Black-box LLMs through Self-Queries </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sam,+D">Dylan Sam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Finzi,+M">Marc Finzi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kolter,+J+Z">J. Zico Kolter</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 28 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> As large language models (LLMs) are increasingly relied on in AI systems, predicting when they make mistakes is crucial. While a great deal of work in the field uses internal representations to interpret model behavior, these representations are inaccessible when given solely black-box access through an API. 
In this paper, we extract features of LLMs in a black-box manner by using follow-up prompts and taking the probabilities of different responses as representations to train reliable predictors of model behavior. We demonstrate that training a linear model on these low-dimensional representations produces reliable and generalizable predictors of model performance at the instance level (e.g., if a particular generation correctly answers a question). Remarkably, these can often outperform white-box linear predictors that operate over a model's hidden state or the full distribution over its vocabulary. In addition, we demonstrate that these extracted features can be used to evaluate more nuanced aspects of a language model's state. For instance, they can be used to distinguish between a clean version of GPT-4o-mini and a version that has been influenced via an adversarial system prompt that answers question-answering tasks incorrectly or introduces bugs into generated code. Furthermore, they can reliably distinguish between different model architectures and sizes, enabling the detection of misrepresented models provided through an API (e.g., identifying if GPT-3.5 is supplied instead of GPT-4o-mini). 
</p> </div> </dd> <dt> <a name='item480'>[480]</a> <a href ="/abs/2501.05952" title="Abstract" id="2501.05952"> arXiv:2501.05952 </a> (replaced) [<a href="/pdf/2501.05952" title="Download PDF" id="pdf-2501.05952" aria-labelledby="pdf-2501.05952">pdf</a>, <a href="https://arxiv.org/html/2501.05952v2" title="View HTML" id="html-2501.05952" aria-labelledby="html-2501.05952" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.05952" title="Other formats" id="oth-2501.05952" aria-labelledby="oth-2501.05952">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scalable Vision Language Model Training via High Quality Data Curation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Dong,+H">Hongyuan Dong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+Z">Zijian Kang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+W">Weijie Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+X">Xiao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ran,+J">Jiao Ran</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> In this paper, we introduce SAIL-VL (ScAlable Vision Language Model TraIning via High QuaLity Data Curation), an open-source vision language model (VLM) series achieving state-of-the-art (SOTA) performance in 2B and 8B parameters. 
The following three key improvements contribute to SAIL-VL's leading performance: (1) Scalable high-quality visual understanding data construction: We implement a data construction pipeline to enable hundred-million-scale high-quality recaption data annotation, and the resulting dataset SAIL-Caption is validated to be of the highest data quality compared with opensource alternatives. (2) Scalable Pretraining with High-Quality Visual Understanding Data: We scale SAIL-VL's pretraining budget up to 655B tokens and show that even a 2B VLM benefits from scaled up training data sizes, exhibiting expected data size scaling laws in visual understanding and instruction following performance. (3) Scalable SFT via data quantity and complexity scaling: We curate a high-quality SFT dataset collection which outperforms opensource alternatives in data quantity scaling effectiveness. We also demonstrate that training with progressively higher-complexity data surpasses baseline one-stage training by a large margin. SAIL-VL series models achieve the highest average score in 18 widely used VLM benchmarks in our evaluation, with the 2B model taking the top position over VLMs of comparable sizes on OpenCompass 2024 (<a href="https://rank.opencompass.org.cn/leaderboard-multimodal" rel="external noopener nofollow" class="link-external link-https">this https URL</a>) demonstrating robust visual comprehension abilities. SAIL-VL series models are released at HuggingFace (<a href="https://huggingface.co/BytedanceDouyinContent" rel="external noopener nofollow" class="link-external link-https">this https URL</a>). 
</p> </div> </dd> <dt> <a name='item481'>[481]</a> <a href ="/abs/2501.10711" title="Abstract" id="2501.10711"> arXiv:2501.10711 </a> (replaced) [<a href="/pdf/2501.10711" title="Download PDF" id="pdf-2501.10711" aria-labelledby="pdf-2501.10711">pdf</a>, <a href="https://arxiv.org/html/2501.10711v3" title="View HTML" id="html-2501.10711" aria-labelledby="html-2501.10711" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.10711" title="Other formats" id="oth-2501.10711" aria-labelledby="oth-2501.10711">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> How Should We Build A Benchmark? Revisiting 274 Code-Related Benchmarks For LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jialun Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chan,+Y">Yuk-Kit Chan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ling,+Z">Zixuan Ling</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wenxuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+S">Shuqing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+M">Mingwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiao,+R">Ruixi Qiao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+Y">Yuting Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+C">Chaozheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+B">Boxi Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+P">Pinjia He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zheng,+Z">Zibin Zheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lyu,+M+R">Michael R. 
Lyu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheung,+S">Shing-Chi Cheung</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 42 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Software Engineering (cs.SE)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Various benchmarks have been proposed to assess the performance of large language models (LLMs) in different coding scenarios. We refer to them as code-related benchmarks. However, there are no systematic guidelines by which such a benchmark should be developed to ensure its quality, reliability, and reproducibility. We propose How2Bench, which is comprised of a 55-criteria checklist as a set of guidelines to govern the development of code-related benchmarks comprehensively. Using HOW2BENCH, we profiled 274 benchmarks released within the past decade and found concerning issues. Nearly 70% of the benchmarks did not take measures for data quality assurance; over 10% did not even open source or only partially open source. Many highly cited benchmarks have loopholes, including duplicated samples, incorrect reference codes/tests/prompts, and unremoved sensitive/confidential information. Finally, we conducted a human study involving 49 participants, which revealed significant gaps in awareness of the importance of data quality, reproducibility, and transparency. 
</p> </div> </dd> <dt> <a name='item482'>[482]</a> <a href ="/abs/2501.14846" title="Abstract" id="2501.14846"> arXiv:2501.14846 </a> (replaced) [<a href="/pdf/2501.14846" title="Download PDF" id="pdf-2501.14846" aria-labelledby="pdf-2501.14846">pdf</a>, <a href="/format/2501.14846" title="Other formats" id="oth-2501.14846" aria-labelledby="oth-2501.14846">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Wormhole Memory: A Rubik's Cube for Cross-Dialogue Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Libo Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The experimental process and code have been uploaded to the Github repository, the link is: <a href="https://github.com/brucewang123456789/GeniusTrail/tree/main/Wormhole%20Memory%20Module" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> In view of the gap in the current large language model in sharing memory across dialogues, this research proposes a wormhole memory module (WMM) to realize memory as a Rubik's cube that can be arbitrarily retrieved between different dialogues. Through simulation experiments, the researcher built an experimental framework based on the Python environment and used setting memory barriers to simulate the current situation where memories between LLMs dialogues are difficult to share. 
The CoQA development data set was imported into the experiment, and the feasibility of its cross-dialogue memory retrieval function was verified for WMM's nonlinear indexing and dynamic retrieval, and a comparative analysis was conducted with the capabilities of Titans and MemGPT memory modules. Experimental results show that WMM demonstrated the ability to retrieve memory across dialogues and the stability of quantitative indicators in eight experiments. It contributes new technical approaches to the optimization of memory management of LLMs and provides experience for the practical application in the future. </p> </div> </dd> <dt> <a name='item483'>[483]</a> <a href ="/abs/2501.15857" title="Abstract" id="2501.15857"> arXiv:2501.15857 </a> (replaced) [<a href="/pdf/2501.15857" title="Download PDF" id="pdf-2501.15857" aria-labelledby="pdf-2501.15857">pdf</a>, <a href="/format/2501.15857" title="Other formats" id="oth-2501.15857" aria-labelledby="oth-2501.15857">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Are Transformers Able to Reason by Connecting Separated Knowledge in Training Data? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+Y">Yutong Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zhaoran Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Humans exhibit remarkable compositional reasoning by integrating knowledge from various sources. 
For example, if someone learns ( B = f(A) ) from one source and ( C = g(B) ) from another, they can deduce ( C=g(B)=g(f(A)) ) even without encountering ( ABC ) together, showcasing the generalization ability of human intelligence. In this paper, we introduce a synthetic learning task, "FTCT" (Fragmented at Training, Chained at Testing), to validate the potential of Transformers in replicating this skill and interpret its inner mechanism. In the training phase, data consist of separated knowledge fragments from an overall causal graph. During testing, Transformers must infer complete causal graph traces by integrating these fragments. Our findings demonstrate that few-shot Chain-of-Thought prompting enables Transformers to perform compositional reasoning on FTCT by revealing correct combinations of fragments, even if such combinations were absent in the training data. Furthermore, the emergence of compositional reasoning ability is strongly correlated with the model complexity and training-testing data similarity. We propose, both theoretically and empirically, that Transformers learn an underlying generalizable program from training, enabling effective compositional reasoning during testing. 
</p> </div> </dd> <dt> <a name='item484'>[484]</a> <a href ="/abs/2501.16207" title="Abstract" id="2501.16207"> arXiv:2501.16207 </a> (replaced) [<a href="/pdf/2501.16207" title="Download PDF" id="pdf-2501.16207" aria-labelledby="pdf-2501.16207">pdf</a>, <a href="https://arxiv.org/html/2501.16207v2" title="View HTML" id="html-2501.16207" aria-labelledby="html-2501.16207" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.16207" title="Other formats" id="oth-2501.16207" aria-labelledby="oth-2501.16207">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Informal to Formal -- Incorporating and Evaluating LLMs on Natural Language Requirements to Verifiable Formal Proofs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+J">Jialun Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yaojie Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Meiziniu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+H">Haoyang Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haokun Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+M">Mengda He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wen,+C">Cheng Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+L">Le Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+H">Hongyu Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+S">Shengchao Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheung,+S">Shing-Chi Cheung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+C">Cong Tian</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Programming Languages (cs.PL) </div> <p class='mathjax'> The research in AI-based formal mathematical reasoning has shown an unstoppable growth trend. These studies have excelled in mathematical competitions like IMO and have made significant progress. This paper focuses on formal verification, an immediate application scenario of formal reasoning, and breaks it down into sub-tasks. We constructed 18k high-quality instruction-response pairs across five formal specification languages (Coq, Lean4, Dafny, ACSL, and TLA+) by distilling gpt-4o and evaluated against ten open-sourced LLMs, including recent popular DeepSeek-R1. We also fine-tuned several 7~8B small models to achieve comparable performance with Deepseek-R1-671B. Interestingly, we observed that fine-tuning with formal data also enhances mathematics, reasoning, and coding capabilities. Fine-tuned models are released at https://huggingface.co/fm-universe. 
</p> </div> </dd> <dt> <a name='item485'>[485]</a> <a href ="/abs/2501.16344" title="Abstract" id="2501.16344"> arXiv:2501.16344 </a> (replaced) [<a href="/pdf/2501.16344" title="Download PDF" id="pdf-2501.16344" aria-labelledby="pdf-2501.16344">pdf</a>, <a href="https://arxiv.org/html/2501.16344v2" title="View HTML" id="html-2501.16344" aria-labelledby="html-2501.16344" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.16344" title="Other formats" id="oth-2501.16344" aria-labelledby="oth-2501.16344">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WhiSPA: Semantically and Psychologically Aligned Whisper with Self-Supervised Contrastive and Student-Teacher Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Rao,+R">Rajath Rao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ganesan,+A">Adithya Ganesan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kjell,+O">Oscar Kjell</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Luby,+J">Jonah Luby</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Raghavan,+A">Akshay Raghavan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Feltman,+S">Scott Feltman</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ringwald,+W">Whitney Ringwald</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Boyd,+R+L">Ryan L. Boyd</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Luft,+B">Benjamin Luft</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ruggero,+C">Camilo Ruggero</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ryant,+N">Neville Ryant</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kotov,+R">Roman Kotov</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Schwartz,+H+A">H. 
Andrew Schwartz</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 15 pages, 8 figures, ACL ARR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Sound (cs.SD) </div> <p class='mathjax'> Current speech encoding pipelines often rely on an additional text-based LM to get robust representations of human communication, even though SotA speech-to-text models often have a LM within. This work proposes an approach to improve the LM within an audio model such that the subsequent text-LM is unnecessary. We introduce WhiSPA (Whisper with Semantic and Psychological Alignment), which leverages a novel audio training objective: contrastive loss with a language model embedding as a teacher. Using over 500k speech segments from mental health audio interviews, we evaluate the utility of aligning Whisper's latent space with semantic representations from a text autoencoder (SBERT) and lexically derived embeddings of basic psychological dimensions: emotion and personality. Over self-supervised affective tasks and downstream psychological tasks, WhiSPA surpasses current speech encoders, achieving an average error reduction of 73.4% and 83.8%, respectively. WhiSPA demonstrates that it is not always necessary to run a subsequent text LM on speech-to-text output in order to get a rich psychological representation of human communication. 
</p> </div> </dd> <dt> <a name='item486'>[486]</a> <a href ="/abs/2502.00510" title="Abstract" id="2502.00510"> arXiv:2502.00510 </a> (replaced) [<a href="/pdf/2502.00510" title="Download PDF" id="pdf-2502.00510" aria-labelledby="pdf-2502.00510">pdf</a>, <a href="https://arxiv.org/html/2502.00510v2" title="View HTML" id="html-2502.00510" aria-labelledby="html-2502.00510" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00510" title="Other formats" id="oth-2502.00510" aria-labelledby="oth-2502.00510">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Who's the MVP? A Game-Theoretic Evaluation Benchmark for Modular Attribution in LLM Agents </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yingxuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+B">Bo Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+S">Siyuan Qi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+C">Chao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+H">Haoyi Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+Y">Yuxuan Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+J">Jinbo Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+H">Haoran Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Z">Ziyi He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiao Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Z">Zongyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qiu,+L">Lin Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+X">Xuezhi Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+X">Xunliang Cai</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yong Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Weinan Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> Large Language Model (LLM) agents frameworks often employ modular architectures, incorporating components such as planning, reasoning, action execution, and reflection to tackle complex tasks. However, quantifying the contribution of each module to overall system performance remains a significant challenge, impeding optimization and interpretability. To address this, we introduce CapaBench (Capability-level Assessment Benchmark), an evaluation framework grounded in cooperative game theory's Shapley Value, which systematically measures the marginal impact of individual modules and their interactions within an agent's architecture. By replacing default modules with test variants across all possible combinations, CapaBench provides a principled method for attributing performance contributions. Key contributions include: (1) We are the first to propose a Shapley Value-based methodology for quantifying the contributions of capabilities in LLM agents; (2) Modules with high Shapley Values consistently lead to predictable performance gains when combined, enabling targeted optimization; and (3) We build a multi-round dataset of over 1,500 entries spanning diverse domains and practical task scenarios, enabling comprehensive evaluation of agent capabilities. CapaBench bridges the gap between component-level evaluation and holistic system assessment, providing actionable insights for optimizing modular LLM agents and advancing their deployment in complex, real-world scenarios. 
</p> </div> </dd> <dt> <a name='item487'>[487]</a> <a href ="/abs/2502.00691" title="Abstract" id="2502.00691"> arXiv:2502.00691 </a> (replaced) [<a href="/pdf/2502.00691" title="Download PDF" id="pdf-2502.00691" aria-labelledby="pdf-2502.00691">pdf</a>, <a href="https://arxiv.org/html/2502.00691v2" title="View HTML" id="html-2502.00691" aria-labelledby="html-2502.00691" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.00691" title="Other formats" id="oth-2502.00691" aria-labelledby="oth-2502.00691">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning Autonomous Code Integration for Math Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Haozhe Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Long Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qu,+C">Chao Qu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+F">Fengming Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+W">Weidi Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chu,+W">Wei Chu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+F">Fangzhen Lin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Artificial Intelligence (cs.AI)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> Recent advances in mathematical problem-solving with language models (LMs) integrate chain-of-thought (CoT) reasoning and code execution to harness their complementary strengths. 
However, existing hybrid frameworks exhibit a critical limitation: they depend on externally dictated instructions or rigid code-integration templates, lacking metacognitive awareness -- the capacity to dynamically evaluate intrinsic capabilities and autonomously determine when and how to integrate tools. This rigidity motivates our study of autonomous code integration, enabling models to adapt tool-usage strategies as their reasoning abilities evolve during training. <br>While reinforcement learning (RL) shows promise for boosting LLM reasoning at scale (e.g., DeepSeek-R1), we demonstrate its inefficiency in learning autonomous code integration due to inadequate exploration of the vast combinatorial space of CoT-code interleaving patterns. To address this challenge, we propose a novel Expectation-Maximization (EM) framework that synergizes structured exploration (E-step) with off-policy RL optimization (M-step), creating a self-reinforcing cycle between metacognitive tool-use decisions and evolving capabilities. Experiments reveal our method achieves superior results through improved exploration. Notably, our 7B model improves over 11% on MATH500 and 9.4% on AIME without o1-like CoT. 
</p> </div> </dd> <dt> <a name='item488'>[488]</a> <a href ="/abs/2502.02904" title="Abstract" id="2502.02904"> arXiv:2502.02904 </a> (replaced) [<a href="/pdf/2502.02904" title="Download PDF" id="pdf-2502.02904" aria-labelledby="pdf-2502.02904">pdf</a>, <a href="https://arxiv.org/html/2502.02904v3" title="View HTML" id="html-2502.02904" aria-labelledby="html-2502.02904" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.02904" title="Other formats" id="oth-2502.02904" aria-labelledby="oth-2502.02904">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ScholaWrite: A Dataset of End-to-End Scholarly Writing Process </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Linghe Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+M">Minhwa Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Volkov,+R">Ross Volkov</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chau,+L+T">Luan Tuyen Chau</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kang,+D">Dongyeop Kang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Equal contribution: Linghe Wang, Minhwa Lee | project page: <a href="https://minnesotanlp.github.io/scholawrite/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL); Neurons and Cognition (q-bio.NC) </div> <p class='mathjax'> Writing is a cognitively demanding task involving continuous decision-making, heavy use of working memory, and frequent switching between multiple activities. Scholarly writing is particularly complex as it requires authors to coordinate many pieces of multiform knowledge. 
To fully understand writers' cognitive thought process, one should fully decode the end-to-end writing data (from individual ideas to final manuscript) and understand their complex cognitive mechanisms in scholarly writing. We introduce ScholaWrite dataset, a first-of-its-kind keystroke corpus of an end-to-end scholarly writing process for complete manuscripts, with thorough annotations of cognitive writing intentions behind each keystroke. Our dataset includes LaTeX-based keystroke data from five preprints with nearly 62K total text changes and annotations across 4 months of paper writing. ScholaWrite shows promising usability and applications (e.g., iterative self-writing), demonstrating the importance of collection of end-to-end writing data, rather than the final manuscript, for the development of future writing assistants to support the cognitive thinking process of scientists. Our de-identified data examples and code are available on our project page. </p> </div> </dd> <dt> <a name='item489'>[489]</a> <a href ="/abs/2502.05171" title="Abstract" id="2502.05171"> arXiv:2502.05171 </a> (replaced) [<a href="/pdf/2502.05171" title="Download PDF" id="pdf-2502.05171" aria-labelledby="pdf-2502.05171">pdf</a>, <a href="https://arxiv.org/html/2502.05171v2" title="View HTML" id="html-2502.05171" aria-labelledby="html-2502.05171" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.05171" title="Other formats" id="oth-2502.05171" aria-labelledby="oth-2502.05171">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Geiping,+J">Jonas Geiping</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McLeish,+S">Sean McLeish</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jain,+N">Neel Jain</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Kirchenbauer,+J">John Kirchenbauer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Singh,+S">Siddharth Singh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bartoldson,+B+R">Brian R. Bartoldson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kailkhura,+B">Bhavya Kailkhura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bhatele,+A">Abhinav Bhatele</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Goldstein,+T">Tom Goldstein</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The model is available at <a href="https://huggingface.co/tomg-group-umd/huginn-0125" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. Code and data recipe can be found at <a href="https://github.com/seal-rg/recurrent-pretraining" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We study a novel language model architecture that is capable of scaling test-time computation by implicitly reasoning in latent space. Our model works by iterating a recurrent block, thereby unrolling to arbitrary depth at test-time. This stands in contrast to mainstream reasoning models that scale up compute by producing more tokens. Unlike approaches based on chain-of-thought, our approach does not require any specialized training data, can work with small context windows, and can capture types of reasoning that are not easily represented in words. We scale a proof-of-concept model to 3.5 billion parameters and 800 billion tokens. 
We show that the resulting model can improve its performance on reasoning benchmarks, sometimes dramatically, up to a computation load equivalent to 50 billion parameters. </p> </div> </dd> <dt> <a name='item490'>[490]</a> <a href ="/abs/2502.08557" title="Abstract" id="2502.08557"> arXiv:2502.08557 </a> (replaced) [<a href="/pdf/2502.08557" title="Download PDF" id="pdf-2502.08557" aria-labelledby="pdf-2502.08557">pdf</a>, <a href="https://arxiv.org/html/2502.08557v2" title="View HTML" id="html-2502.08557" aria-labelledby="html-2502.08557" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.08557" title="Other formats" id="oth-2502.08557" aria-labelledby="oth-2502.08557">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> QA-Expand: Multi-Question Answer Generation for Enhanced Query Expansion in Information Retrieval </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+W">Wonduk Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+S">Seunghyun Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG); Multiagent Systems (cs.MA) </div> <p class='mathjax'> Query expansion is widely used in Information Retrieval (IR) to improve search outcomes by enriching queries with additional contextual information. Although recent Large Language Model (LLM) based methods generate pseudo-relevant content and expanded terms via multiple prompts, they often yield repetitive, narrow expansions that lack the diverse context needed to retrieve all relevant information. In this paper, we introduce QA-Expand, a novel and effective framework for query expansion. 
It first generates multiple relevant questions from the initial query and subsequently produces corresponding pseudo-answers as surrogate documents. A feedback model further rewrites and filters these answers to ensure only the most informative augmentations are incorporated. Extensive experiments on benchmarks such as BEIR and TREC demonstrate that QA-Expand enhances retrieval performance by up to 13% over state-of-the-art methods, offering a robust solution for modern retrieval challenges. </p> </div> </dd> <dt> <a name='item491'>[491]</a> <a href ="/abs/2502.09782" title="Abstract" id="2502.09782"> arXiv:2502.09782 </a> (replaced) [<a href="/pdf/2502.09782" title="Download PDF" id="pdf-2502.09782" aria-labelledby="pdf-2502.09782">pdf</a>, <a href="/format/2502.09782" title="Other formats" id="oth-2502.09782" aria-labelledby="oth-2502.09782">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Acoustic Side-Channel Attacks on Keyboards Using Transformers and Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+J+H">Jin Hyun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ayati,+S+A">Seyyed Ali Ayati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cai,+Y">Yichen Cai</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> We will reflect comments from the reviewers and re-submit </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> The increasing prevalence of microphones in everyday devices and the growing reliance on online services have amplified the risk of acoustic side-channel attacks (ASCAs) targeting keyboards. 
This study explores deep learning techniques, specifically vision transformers (VTs) and large language models (LLMs), to enhance the effectiveness and applicability of such attacks. We present substantial improvements over prior research, with the CoAtNet model achieving state-of-the-art performance. Our CoAtNet shows a 5.0% improvement for keystrokes recorded via smartphone (Phone) and 5.9% for those recorded via Zoom compared to previous benchmarks. We also evaluate transformer architectures and language models, with the best VT model matching CoAtNet's performance. A key advancement is the introduction of a noise mitigation method for real-world scenarios. By using LLMs for contextual understanding, we detect and correct erroneous keystrokes in noisy environments, enhancing ASCA performance. Additionally, fine-tuned lightweight language models with Low-Rank Adaptation (LoRA) deliver comparable performance to heavyweight models with 67X more parameters. This integration of VTs and LLMs improves the practical applicability of ASCA mitigation, marking the first use of these technologies to address ASCAs and error correction in real-world scenarios. 
</p> </div> </dd> <dt> <a name='item492'>[492]</a> <a href ="/abs/2502.09969" title="Abstract" id="2502.09969"> arXiv:2502.09969 </a> (replaced) [<a href="/pdf/2502.09969" title="Download PDF" id="pdf-2502.09969" aria-labelledby="pdf-2502.09969">pdf</a>, <a href="https://arxiv.org/html/2502.09969v2" title="View HTML" id="html-2502.09969" aria-labelledby="html-2502.09969" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09969" title="Other formats" id="oth-2502.09969" aria-labelledby="oth-2502.09969">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Data Valuation using Neural Networks for Efficient Instruction Fine-Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Agarwal,+I">Ishika Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakkani-T%C3%BCr,+D">Dilek Hakkani-Tür</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) </div> <p class='mathjax'> Influence functions provide crucial insights into model training, but existing methods suffer from large computational costs and limited generalization. Particularly, recent works have proposed various metrics and algorithms to calculate the influence of data using language models, which do not scale well with large models and datasets. This is because of the expensive forward and backward passes required for computation, substantial memory requirements to store large models, and poor generalization of influence estimates to new data. In this paper, we explore the use of small neural networks -- which we refer to as the InfluenceNetwork -- to estimate influence values, achieving up to 99% cost reduction. 
Our evaluation demonstrates that influence values can be estimated with models just 0.0027% the size of full language models (we use 7B and 8B versions). We apply our algorithm of estimating influence values (called NN-CIFT: Neural Networks for effiCient Instruction Fine-Tuning) to the downstream task of subset selection for general instruction fine-tuning. In our study, we include four state-of-the-art influence functions and show no compromise in performance, despite large speedups, between NN-CIFT and the original influence functions. We provide an in-depth hyperparameter analysis of NN-CIFT. The code for our method can be found here: <a href="https://github.com/agarwalishika/NN-CIFT" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. </p> </div> </dd> <dt> <a name='item493'>[493]</a> <a href ="/abs/2502.10248" title="Abstract" id="2502.10248"> arXiv:2502.10248 </a> (replaced) [<a href="/pdf/2502.10248" title="Download PDF" id="pdf-2502.10248" aria-labelledby="pdf-2502.10248">pdf</a>, <a href="https://arxiv.org/html/2502.10248v2" title="View HTML" id="html-2502.10248" aria-labelledby="html-2502.10248" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10248" title="Other formats" id="oth-2502.10248" aria-labelledby="oth-2502.10248">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+G">Guoqing Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+H">Haoyang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+K">Kun Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+L">Liangyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Duan,+N">Nan Duan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Yin,+S">Shengming Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wan,+C">Changyi Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+R">Ranchen Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+X">Xiaoniu Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xing Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+Y">Yu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+D">Deshan Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+D">Deyu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jian Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+K">Kaijun Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=An,+K">Kang An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+M">Mei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ji,+W">Wei Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiling Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+W">Wen Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Han,+X">Xin Han</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+Y">Yanan Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ge,+Z">Zheng Ge</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+A">Aojie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+B">Bizhu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+B">Brian Li</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Miao,+C">Changxing Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xu,+C">Chen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+C">Chenfei Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+C">Chenguang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+D">Dapeng Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+D">Dingyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+E">Enle Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+G">Gang Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+G">Ge Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+G">Guanzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+G">Gulin Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+H">Haiyang Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nie,+H">Hao Nie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jia,+H">Haonan Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hu,+H">Hanpeng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+H">Hanqi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+H">Haolong Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Heng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+H">Hongcheng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+H">Huilin Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+H">Huixin Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gong,+J">Jiahao Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jianchang Wu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jiaoren Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+J">Jie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+J">Jie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jiashuai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+J">Jiashuo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Jingyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+J">Junjing Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Junzhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+K">Kaixiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Lei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+L">Lei Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tan,+L">Liguo Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+L">Liwen Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shi,+L">Liying Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+M">Mingliang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cheng,+M">Muhua Cheng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+N">Na Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Q">Qiaohui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+Q">Qinglin He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liang,+Q">Qiuyan Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Q">Quan Sun</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+R">Ran Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+R">Rui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pang,+S">Shaoliang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+S">Shiliang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Sitong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+S">Shuli Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Cao,+T">Tiancheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+T">Tianyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ming,+W">Weipeng Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+W">Wenqing He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+X">Xu Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+X">Xuelin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zeng,+X">Xianfang Zeng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaojia Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dai,+Y">Yaqi Dai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+Y">Yanbo Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+Y">Yineng Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yingming Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yilei Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yuanwei Lu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yu Luo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luo,+Y">Yuchu Luo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 36 pages, 14 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Computation and Language (cs.CL) </div> <p class='mathjax'> We present Step-Video-T2V, a state-of-the-art text-to-video pre-trained model with 30B parameters and the ability to generate videos up to 204 frames in length. A deep compression Variational Autoencoder, Video-VAE, is designed for video generation tasks, achieving 16x16 spatial and 8x temporal compression ratios, while maintaining exceptional video reconstruction quality. User prompts are encoded using two bilingual text encoders to handle both English and Chinese. A DiT with 3D full attention is trained using Flow Matching and is employed to denoise input noise into latent frames. A video-based DPO approach, Video-DPO, is applied to reduce artifacts and improve the visual quality of the generated videos. We also detail our training strategies and share key observations and insights. Step-Video-T2V's performance is evaluated on a novel video generation benchmark, Step-Video-T2V-Eval, demonstrating its state-of-the-art text-to-video quality when compared with both open-source and commercial engines. Additionally, we discuss the limitations of current diffusion-based model paradigm and outline future directions for video foundation models. We make both Step-Video-T2V and Step-Video-T2V-Eval available at <a href="https://github.com/stepfun-ai/Step-Video-T2V" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
The online version can be accessed from <a href="https://yuewen.cn/videos" rel="external noopener nofollow" class="link-external link-https">this https URL</a> as well. Our goal is to accelerate the innovation of video foundation models and empower video content creators. </p> </div> </dd> </dl> <div class='paging'>Total of 493 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.CL/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here 
to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank" rel="noopener">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 
23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>